Here, I analyse and document my progress with the analysis of a world-wide whole genome sampling of Zymoseptoria tritici. Some of the analysis are just exploratory while some other are lined up in a clear path to answering questions related to the history of Z.tritici and to better understand its adaptation to various climates.
library(knitr)
library(reticulate)
#Data wrangling and data viz
library(tidyverse)
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggrepel)
library(ggtext)
library(scatterpie)
library(pheatmap)
library(ggridges)
library(ggtree)
library(tidytree)
library(multcomp)
library(lsmeans)
#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)
#Variables
world <- map_data("world")
project_dir="~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"
#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"
#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
depth_per_gene_dir = paste0(VAR_dir, "2_Depth_per_gene/")
vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")
#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
effectors_annotation_file = "~/Documents/Postdoc_Eva/Manuscripts/Accepted/Alice_Cecile_Comparative_genomics/Data_for_publication/Annotations_2018_genomes_for_publication.tab"
eggnog_annotation = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.eggnog")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))
Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(TERIP=TE_RIP_dir)
Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)
Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)
#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)
# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
mutate(Filter = "Mutants_etc"))
##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")
## Metadata of genotyped samples
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()
Zt_meta = read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"),
col_names = temp, delim = "\t",
na = "\\N", guess_max = 2000) %>%
unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
inner_join(., genotyped_samples) %>%
mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
dplyr::select(ID_file, Continent, Country, Latitude, Longitude, Latitude2, Longitude2,
Sampling_Date, Collection)
temp = read_tsv(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv")) %>%
dplyr::select(ID_file = Sample, Cluster)
Zt_meta = full_join(Zt_meta, temp)
#genotyped_samples %>%
# filter(!(ID_file %in% filtered_samples$ID_file)) %>%
# write_tsv(Zt_list, col_names = F)
#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)
myColors2 <- c("#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba",
"#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
names(myColors2) = levels(factor(Zt_meta$Cluster))
Color_Cluster = ggplot2::scale_colour_manual(name = "Cluster", values = myColors2)
Fill_Cluster = ggplot2::scale_fill_manual(name = "Cluster", values = myColors2)
## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3",
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc", "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")
## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)
#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
#Run on the cluster
#Create bed file with 10kb windows
#(including the last window which can be smaller)
while read chr length temp temp2 temp3; do
start=0;
while [ "$start" -le "$length" ] ; do
if [ "$(($start + 10000))" -le "$length" ] ;
then
echo -e "${chr}\t${start}\t$(($start + 10000))" ;
else echo -e "${chr}\t${start}\t$length" ;
fi ;
start=`expr $start + 10000` ;
done ;
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa.fai > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
#GC etc from bedtools nuc
~/Software/bedtools nuc \
-fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
-bed /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab
#RIP per window
#(my script makes a summary for all seq in a multifasta, so I tricked it my giving it a single window at a time)
#columns are "CHROM", "Start", "End", "GC", "Product index", "Substrate index", "Composite"
while read chr start end ;
do
echo -e "${chr}\t${start}\t${end}" > temp.bed ;
~/Software/bedtools getfasta -fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
-bed temp.bed -fo temp.fa ;
python GC_RIP_per_read_fastq.py --input_format fasta --out temp temp.fa ;
values=$(cat temp.txt | ~/Software/datamash-1.3/datamash transpose | grep "Median" | cut -f 2,3,4,5) ;
echo -e "${chr}\t${start}\t${end}\t$values" \
>> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv ;
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
#Count variants
#columns are CHROM, Start(IN 10KB UNITS), Variant_number
zcat /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz | \
grep -v "#" | awk 'BEGIN {FS="\t"; OFS="\t"} {print $1,$2,int($2/10000)}' | \
~/Software/bedtools groupby -g 1,3 -o count -c 2 > \
/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab
#Gene coverage
#columns are CHROM, Start, End, Nb_overlapping_genes, Coverage_bp_gene, Window_length, Coverage_frac_gene
~/Software/bedtools coverage \
-a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
-b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3 \
> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab
#Reference TE coverage
#columns are CHROM, Start, End, Nb_overlapping_TEs, Coverage_bp_TE, Window_length, Coverage_frac_TE
~/Software/bedtools coverage \
-a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
-b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.Badet_Oggenfuss_2019.TE.gtf \
> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab
# Uploading commands from the cluster to my computer.
# TE and RIP
rsync -avP \
alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads* \
~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
# Nb_reads_per_TE.txt
# Nb_reads.txt
rsync -avP \
alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt \
~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/ \
~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/
#Per windows: coordinates, RIP, GC, SNP counts
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/
rsync -avP alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/
Previously, based on the study of TE and RIP in fully-assembled Z.tritici genomes, a hypothesis was drawn. The lower RIP in TEs of European samples, as compared to Iranian isolates, could indicate a loss of RIP in Z.tritici when it spread out of its area of origin. Here, I would like to investigate this possibility in the different pop.
The data plotted here are based on the following steps:
First, I mapped the reads on the TE consensus created by Ursula based on Thomas’s pangenome. I also detected reference andnon-reference TE insertions with ngs_te_mapper2. I visualize the results here in terms of percentage of reads mapping on these consensus and of number of insertions. I try here to see if there are biases between collections, as well as to compare the different TE content estimations that I have used.
#Reading in the data
TE_qty = read_delim(paste0(RIP_DIR, "Nb_reads.txt"), delim = " ") %>%
dplyr::filter(Total_reads > Te_aligned_reads) %>%
dplyr::mutate(Percent_TE_Reads = Te_aligned_reads * 100 / Total_reads) %>%
full_join(., Zt_meta) %>%
unite(Continent, Country, ID_file, col = "for_display", remove = F)
world_avg <-
TE_qty %>%
dplyr::summarize(avg = mean(as.numeric(Percent_TE_Reads), na.rm = T)) %>%
pull(avg)
#Summarize numbers
#_________________
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" -v | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Ref_TE_numbers.txt
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Non-ref_TE_numbers.txt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*_TE_numbers.txt ../4_TE_RIP/
#Gather per strain files into one big file
#_________________________________________
#cd /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.nonref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.nonref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.nonref.bed ; fi ; done > Non-ref_all_strains.bed
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.ref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.ref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.ref.bed ; fi ; done > Ref_all_strains.bed
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*ef_all_strains.bed ../4_TE_RIP/
TE_qty = full_join(read_delim(paste0(TE_RIP_dir, "Non-ref_TE_numbers.txt"),
col_names = c("ID_file", "Non_ref_insertions"),
delim = " "),
read_delim(paste0(TE_RIP_dir, "Ref_TE_numbers.txt"),
col_names = c("ID_file", "Ref_insertions"),
delim = " ")) %>%
mutate(Total_insertions = Ref_insertions + Non_ref_insertions) %>%
filter(Total_insertions > 0) %>%
left_join(TE_qty)
p1 = TE_qty %>%
ggplot(aes(x = Non_ref_insertions, y = Ref_insertions, col = Continent)) +
geom_point(alpha = .8) +
theme_bw() +
Color_Continent +
theme(legend.position = "None")
p2 = TE_qty %>%
ggplot(aes(x = Percent_TE_Reads, y = Total_insertions, col = Continent)) +
geom_point(alpha = .8) +
theme_bw() +
Color_Continent +
theme(legend.position = "None")
p3 = TE_qty %>%
ggplot(aes(x = Percent_TE_Reads, y = Ref_insertions, col = Continent)) +
geom_point(alpha = .8) +
theme_bw() +
Color_Continent +
theme(legend.position = "None")
p4 = TE_qty %>%
ggplot(aes(x = Percent_TE_Reads, y = Non_ref_insertions, col = Continent)) +
geom_point(alpha = .8) +
theme_bw() +
Color_Continent +
theme(legend.position = "None")
cowplot::plot_grid(p1, p2, p3, p4, ncol = 2, nrow = 2)
#Collections comparisons
ggplot(TE_qty, aes(x = Total_insertions, y = Percent_TE_Reads, col = Collection)) +
geom_point(alpha = .8) +
theme_bw() +
labs(title = "Comparison of whole TE content estimation per sample")
p1 = ggplot(TE_qty, aes(x = Percent_TE_Reads, col = Collection, fill = Collection)) +
geom_density(alpha = .4) +
theme_bw()
p2 = ggplot(TE_qty, aes(x = Total_insertions, col = Collection, fill = Collection)) +
geom_density(alpha = .4) +
theme_bw()
comment = ggplot() + theme_void() +
geom_text(aes(x = 1, y = 1, label = "Comparison of whole TE content estimation per collection"),
size = 6)
cowplot::plot_grid(comment,
p1 + theme(legend.position = "None"),
p2 + theme(legend.position = "None"),
get_legend(p1 + theme(legend.position = "bottom")),
ncol = 1, nrow = 4, rel_heights = c(.3, 1, 1, .5))
p1 = ggplot(TE_qty, aes(x = Ref_insertions, col = Collection, fill = Collection)) +
geom_density(alpha = .4) +
theme_bw()
p2 = ggplot(TE_qty, aes(x = Non_ref_insertions, col = Collection, fill = Collection)) +
geom_density(alpha = .4) +
theme_bw()
comment = ggplot() + theme_void() +
geom_text(aes(x = 1, y = 1, label = "Comparison of reference and non-reference insertions numbers"),
size = 6)
cowplot::plot_grid(comment,
p1 + theme(legend.position = "None"),
p2 + theme(legend.position = "None"),
get_legend(p1 + theme(legend.position = "bottom")),
ncol = 1, nrow = 4, rel_heights = c(.3, 1, 1, .5))
temp = TE_qty %>% filter(!is.na(Continent))%>% filter(!is.na(Collection))
model1 = lm(Non_ref_insertions ~ Collection,
data=temp)
summary(model1)
##
## Call:
## lm(formula = Non_ref_insertions ~ Collection, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.176 -20.884 -0.884 19.672 171.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 195.328 3.073 63.555 < 2e-16 ***
## CollectionHartmann_FstQst_2015 6.848 4.372 1.566 0.117689
## CollectionHartmann_Oregon_2016 140.108 4.723 29.665 < 2e-16 ***
## CollectionJGI 18.749 6.360 2.948 0.003287 **
## CollectionJGI_2 2.297 6.872 0.334 0.738293
## CollectionJGI_3 -9.478 8.360 -1.134 0.257251
## CollectionJGI_4 -1.328 20.309 -0.065 0.947874
## CollectionJGI_Thierry -5.468 6.129 -0.892 0.372584
## CollectionMMcDonald_2018 12.937 5.218 2.479 0.013360 *
## CollectionSyngenta 23.556 3.720 6.331 3.97e-10 ***
## CollectionThird_batch_2018_BM_TM -35.016 9.220 -3.798 0.000157 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.77 on 832 degrees of freedom
## Multiple R-squared: 0.6015, Adjusted R-squared: 0.5967
## F-statistic: 125.6 on 10 and 832 DF, p-value: < 2.2e-16
model2 = lm(Ref_insertions ~ Collection,
data=temp)
summary(model2)
##
## Call:
## lm(formula = Ref_insertions ~ Collection, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128.967 -11.800 1.923 13.033 90.442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 183.000 2.005 91.276 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -71.200 2.852 -24.962 < 2e-16 ***
## CollectionHartmann_Oregon_2016 14.457 3.081 4.692 3.16e-06 ***
## CollectionJGI -43.923 4.149 -10.587 < 2e-16 ***
## CollectionJGI_2 -60.000 4.483 -13.384 < 2e-16 ***
## CollectionJGI_3 -44.200 5.454 -8.104 1.89e-15 ***
## CollectionJGI_4 -2.667 13.249 -0.201 0.841
## CollectionJGI_Thierry -33.442 3.998 -8.364 2.53e-16 ***
## CollectionMMcDonald_2018 2.824 3.404 0.830 0.407
## CollectionSyngenta 109.967 2.427 45.309 < 2e-16 ***
## CollectionThird_batch_2018_BM_TM -72.813 6.015 -12.106 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.68 on 832 degrees of freedom
## Multiple R-squared: 0.9043, Adjusted R-squared: 0.9032
## F-statistic: 786.6 on 10 and 832 DF, p-value: < 2.2e-16
model3 = lm(Percent_TE_Reads ~ Collection,
data=temp)
summary(model3)
##
## Call:
## lm(formula = Percent_TE_Reads ~ Collection, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9485 -0.9017 -0.0092 0.9203 11.9850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.1347 0.1699 89.067 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -5.4028 0.2423 -22.299 < 2e-16 ***
## CollectionHartmann_Oregon_2016 2.9589 0.2626 11.266 < 2e-16 ***
## CollectionJGI -0.3348 0.3485 -0.961 0.337
## CollectionJGI_2 -7.3423 0.3764 -19.508 < 2e-16 ***
## CollectionJGI_3 2.5326 0.4575 5.535 4.19e-08 ***
## CollectionJGI_4 1.5202 1.1099 1.370 0.171
## CollectionJGI_Thierry 1.6168 0.3359 4.814 1.77e-06 ***
## CollectionMMcDonald_2018 4.0871 0.2905 14.068 < 2e-16 ***
## CollectionSyngenta -1.5919 0.2049 -7.768 2.40e-14 ***
## CollectionThird_batch_2018_BM_TM -0.6011 0.5536 -1.086 0.278
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.9 on 815 degrees of freedom
## (17 observations deleted due to missingness)
## Multiple R-squared: 0.7185, Adjusted R-squared: 0.715
## F-statistic: 208 on 10 and 815 DF, p-value: < 2.2e-16
Comparing the 3 models, it is clear that the reference insertions estimation is the most biased of the 3 methods (when taking the sequencing batch into account). The non-reference insertions does not look too bad, until we add the recent Oregon population.
I now want to see if adding the continent information on top of the collection one to the model brings a significant improvement or not.
model11 = lm(Non_ref_insertions ~ Collection + Continent,
data=temp)
summary(model11)
##
## Call:
## lm(formula = Non_ref_insertions ~ Collection + Continent, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.673 -15.861 -0.884 13.735 171.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164.4625 4.4906 36.624 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -11.5345 3.6058 -3.199 0.001432 **
## CollectionHartmann_Oregon_2016 87.2836 4.0142 21.744 < 2e-16 ***
## CollectionJGI 3.8918 4.7256 0.824 0.410422
## CollectionJGI_2 -3.4800 5.1701 -0.673 0.501074
## CollectionJGI_3 14.8525 6.3100 2.354 0.018817 *
## CollectionJGI_4 -0.9057 14.8009 -0.061 0.951222
## CollectionJGI_Thierry 7.8013 4.5485 1.715 0.086696 .
## CollectionMMcDonald_2018 -19.3695 6.5032 -2.978 0.002982 **
## CollectionSyngenta 23.9780 4.1834 5.732 1.39e-08 ***
## CollectionThird_batch_2018_BM_TM -35.7658 7.0670 -5.061 5.14e-07 ***
## ContinentAsia 30.3033 25.9175 1.169 0.242651
## ContinentEurope 30.4432 5.1348 5.929 4.48e-09 ***
## ContinentMiddle East -17.3732 5.1923 -3.346 0.000857 ***
## ContinentNorth America 83.6901 4.9982 16.744 < 2e-16 ***
## ContinentOceania 63.1717 6.8577 9.212 < 2e-16 ***
## ContinentSouth America 29.1426 5.4751 5.323 1.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.73 on 826 degrees of freedom
## Multiple R-squared: 0.8, Adjusted R-squared: 0.7961
## F-statistic: 206.4 on 16 and 826 DF, p-value: < 2.2e-16
model21 = lm(Ref_insertions ~ Collection + Continent,
data=temp)
summary(model21)
##
## Call:
## lm(formula = Ref_insertions ~ Collection + Continent, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128.967 -10.457 0.543 12.033 78.458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 165.826 3.912 42.386 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -78.048 3.141 -24.845 < 2e-16 ***
## CollectionHartmann_Oregon_2016 4.558 3.497 1.303 0.193
## CollectionJGI -52.176 4.117 -12.673 < 2e-16 ***
## CollectionJGI_2 -67.136 4.504 -14.905 < 2e-16 ***
## CollectionJGI_3 -47.190 5.497 -8.584 < 2e-16 ***
## CollectionJGI_4 -18.683 12.895 -1.449 0.148
## CollectionJGI_Thierry -35.036 3.963 -8.841 < 2e-16 ***
## CollectionMMcDonald_2018 -6.196 5.666 -1.094 0.274
## CollectionSyngenta 93.951 3.645 25.778 < 2e-16 ***
## CollectionThird_batch_2018_BM_TM -83.241 6.157 -13.520 < 2e-16 ***
## ContinentAsia 17.415 22.580 0.771 0.441
## ContinentEurope 33.190 4.474 7.419 2.93e-13 ***
## ContinentMiddle East 7.138 4.524 1.578 0.115
## ContinentNorth America 27.074 4.355 6.217 8.02e-10 ***
## ContinentOceania 26.194 5.975 4.384 1.31e-05 ***
## ContinentSouth America 21.752 4.770 4.560 5.89e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.54 on 826 degrees of freedom
## Multiple R-squared: 0.9143, Adjusted R-squared: 0.9127
## F-statistic: 551.1 on 16 and 826 DF, p-value: < 2.2e-16
model31 = lm(Percent_TE_Reads ~ Collection + Continent,
data=temp)
summary(model31)
##
## Call:
## lm(formula = Percent_TE_Reads ~ Collection + Continent, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9485 -0.9033 0.0001 0.8436 11.5606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.2015 0.3352 45.346 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -6.0746 0.2685 -22.623 < 2e-16 ***
## CollectionHartmann_Oregon_2016 2.4598 0.2981 8.253 6.26e-16 ***
## CollectionJGI -0.7569 0.3480 -2.175 0.029911 *
## CollectionJGI_2 -8.0273 0.3812 -21.057 < 2e-16 ***
## CollectionJGI_3 2.6711 0.4643 5.753 1.24e-08 ***
## CollectionJGI_4 0.6738 1.0868 0.620 0.535471
## CollectionJGI_Thierry 1.4422 0.3355 4.298 1.93e-05 ***
## CollectionMMcDonald_2018 1.6677 0.4814 3.464 0.000559 ***
## CollectionSyngenta -2.4384 0.3118 -7.820 1.65e-14 ***
## CollectionThird_batch_2018_BM_TM -1.2122 0.5653 -2.144 0.032314 *
## ContinentAsia 0.4002 1.9140 0.209 0.834415
## ContinentEurope 0.7796 0.3818 2.042 0.041475 *
## ContinentMiddle East -1.1902 0.3852 -3.090 0.002070 **
## ContinentNorth America 0.4323 0.3709 1.165 0.244222
## ContinentOceania 2.3526 0.5067 4.643 4.01e-06 ***
## ContinentSouth America 0.2622 0.4069 0.644 0.519539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.813 on 809 degrees of freedom
## (17 observations deleted due to missingness)
## Multiple R-squared: 0.7455, Adjusted R-squared: 0.7404
## F-statistic: 148.1 on 16 and 809 DF, p-value: < 2.2e-16
#Comparison of simple vs complex models
anova(model1, model11)
## Analysis of Variance Table
##
## Model 1: Non_ref_insertions ~ Collection
## Model 2: Non_ref_insertions ~ Collection + Continent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 832 1005904
## 2 826 504990 6 500914 136.56 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model21)
## Analysis of Variance Table
##
## Model 1: Ref_insertions ~ Collection
## Model 2: Ref_insertions ~ Collection + Continent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 832 428076
## 2 826 383300 6 44775 16.082 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model3, model31)
## Analysis of Variance Table
##
## Model 1: Percent_TE_Reads ~ Collection
## Model 2: Percent_TE_Reads ~ Collection + Continent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 815 2941.6
## 2 809 2659.7 6 281.95 14.293 1.581e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
temp = TE_qty %>% filter(!is.na(Continent))%>% filter(!is.na(Collection)) %>% filter(!is.na(Cluster))
model11 = lm(Non_ref_insertions ~ Collection + Continent,
data=temp)
model31 = lm(Percent_TE_Reads ~ Collection + Continent,
data=temp)
model12 = lm(Non_ref_insertions ~ Collection + Continent + Cluster,
data=temp)
summary(model12)
##
## Call:
## lm(formula = Non_ref_insertions ~ Collection + Continent + Cluster,
## data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.51 -14.13 -0.91 12.62 170.90
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148.2760 18.1001 8.192 1.16e-15 ***
## CollectionHartmann_FstQst_2015 -16.0396 3.8237 -4.195 3.07e-05 ***
## CollectionHartmann_Oregon_2016 80.2749 4.2556 18.863 < 2e-16 ***
## CollectionJGI -1.3127 5.0295 -0.261 0.794165
## CollectionJGI_2 4.4347 5.7895 0.766 0.443936
## CollectionJGI_3 10.8945 7.3431 1.484 0.138340
## CollectionJGI_4 15.1077 16.6859 0.905 0.365545
## CollectionJGI_Thierry 7.9616 5.4045 1.473 0.141151
## CollectionMMcDonald_2018 -39.5282 24.8236 -1.592 0.111740
## CollectionSyngenta 15.7117 4.3633 3.601 0.000339 ***
## CollectionThird_batch_2018_BM_TM -43.7468 7.0535 -6.202 9.36e-10 ***
## ContinentEurope 6.3383 8.9825 0.706 0.480645
## ContinentMiddle East 5.9472 16.9098 0.352 0.725163
## ContinentNorth America 106.6342 18.8080 5.670 2.07e-08 ***
## ContinentOceania 102.0648 30.2146 3.378 0.000769 ***
## ContinentSouth America 59.2768 18.9583 3.127 0.001839 **
## ClusterV10 0.2511 6.2934 0.040 0.968184
## ClusterV11 1.7960 17.5194 0.103 0.918377
## ClusterV2 48.7780 19.3669 2.519 0.011996 *
## ClusterV3 -5.9093 7.0367 -0.840 0.401309
## ClusterV4 -15.5975 23.5499 -0.662 0.507980
## ClusterV5 -18.3211 7.0810 -2.587 0.009865 **
## ClusterV6 -13.0975 7.5300 -1.739 0.082392 .
## ClusterV7 NA NA NA NA
## ClusterV8 NA NA NA NA
## ClusterV9 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.86 on 724 degrees of freedom
## Multiple R-squared: 0.8366, Adjusted R-squared: 0.8317
## F-statistic: 168.5 on 22 and 724 DF, p-value: < 2.2e-16
model32 = lm(Percent_TE_Reads ~ Collection + Continent + Cluster,
data=temp)
summary(model32)
##
## Call:
## lm(formula = Percent_TE_Reads ~ Collection + Continent + Cluster,
## data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9540 -0.8384 -0.0147 0.8687 11.0786
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.7255 1.3803 9.220 < 2e-16 ***
## CollectionHartmann_FstQst_2015 -6.3569 0.2951 -21.541 < 2e-16 ***
## CollectionHartmann_Oregon_2016 1.9487 0.3277 5.946 4.31e-09 ***
## CollectionJGI -0.5999 0.3849 -1.559 0.11952
## CollectionJGI_2 -7.7921 0.4435 -17.569 < 2e-16 ***
## CollectionJGI_3 2.5291 0.5612 4.506 7.72e-06 ***
## CollectionJGI_4 0.8124 1.2720 0.639 0.52322
## CollectionJGI_Thierry 1.2944 0.4143 3.124 0.00186 **
## CollectionMMcDonald_2018 -2.3335 1.8900 -1.235 0.21737
## CollectionSyngenta -2.7579 0.3392 -8.131 1.90e-15 ***
## CollectionThird_batch_2018_BM_TM -1.2967 0.5728 -2.264 0.02388 *
## ContinentEurope 1.5390 0.6865 2.242 0.02528 *
## ContinentMiddle East 0.8308 1.2894 0.644 0.51955
## ContinentNorth America 2.2689 1.4335 1.583 0.11393
## ContinentOceania 10.8451 2.3014 4.713 2.95e-06 ***
## ContinentSouth America 2.6608 1.4453 1.841 0.06603 .
## ClusterV10 1.1505 0.4796 2.399 0.01670 *
## ClusterV11 1.9641 1.3336 1.473 0.14128
## ClusterV2 2.0416 1.4744 1.385 0.16658
## ClusterV3 -3.0521 0.5419 -5.632 2.56e-08 ***
## ClusterV4 -5.6477 1.7927 -3.150 0.00170 **
## ClusterV5 0.1073 0.5425 0.198 0.84319
## ClusterV6 0.1653 0.5746 0.288 0.77373
## ClusterV7 NA NA NA NA
## ClusterV8 NA NA NA NA
## ClusterV9 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.74 on 709 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.7593, Adjusted R-squared: 0.7518
## F-statistic: 101.6 on 22 and 709 DF, p-value: < 2.2e-16
anova(model11, model12)
## Analysis of Variance Table
##
## Model 1: Non_ref_insertions ~ Collection + Continent
## Model 2: Non_ref_insertions ~ Collection + Continent + Cluster
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 731 400731
## 2 724 378320 7 22412 6.1271 5.656e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model31, model32)
## Analysis of Variance Table
##
## Model 1: Percent_TE_Reads ~ Collection + Continent
## Model 2: Percent_TE_Reads ~ Collection + Continent + Cluster
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 716 2305.6
## 2 709 2146.7 7 158.92 7.4981 1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The fit is indeed improved by adding the collection information, with both TE quantity estimations.
depth = read_tsv(paste0(vcf_dir, "Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.idepth"))
GC = read_delim(paste0(RIP_DIR, "GC_percent.txt"), col_names = c("ID_file", "TE", "Estimate", "Median_GC", "Mean_GC"), delim = " ") %>%
dplyr::select(-TE, -Estimate)
biases = inner_join(TE_qty, depth, by = c("ID_file" = "INDV")) %>%
inner_join(GC)
p1 = biases %>%
ggplot(aes(x = Mean_GC, y = Percent_TE_Reads, col = Collection)) +
geom_point(alpha = .5) +
theme_bw()
p2 = biases %>%
ggplot(aes(x = Mean_GC, y = Total_insertions, col = Collection)) +
geom_point(alpha = .5) +
theme_bw()
p3 = biases %>%
ggplot(aes(x = MEAN_DEPTH, y = Percent_TE_Reads, col = Collection)) +
geom_point(alpha = .5) +
theme_bw()
p4 = biases %>%
ggplot(aes(x = MEAN_DEPTH, y = Total_insertions, col = Collection)) +
geom_point(alpha = .5) +
theme_bw()
bloc = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2+ theme(legend.position = "none"))
cowplot::plot_grid(bloc, get_legend(p3 + theme(legend.position = "bottom")),
rel_heights = c(1, .2), ncol = 1)
bloc = cowplot::plot_grid(p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"))
cowplot::plot_grid(bloc, get_legend(p3 + theme(legend.position = "bottom")),
rel_heights = c(1, .2), ncol = 1)
p1 = biases %>%
filter(MEAN_DEPTH < 80) %>%
ggplot(aes(x = MEAN_DEPTH, y = Total_insertions)) +
geom_point(alpha = .5) +
theme_bw() +
geom_smooth(method = "lm")
p2 = biases %>%
filter(MEAN_DEPTH < 25) %>%
ggplot(aes(x = MEAN_DEPTH, y = Total_insertions)) +
geom_point(alpha = .5) +
theme_bw() +
geom_smooth(method = "lm")
bloc = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2+ theme(legend.position = "none"))
titre = ggplot() + geom_text(aes(x = 1, y = 1, label = "Depth bias is mostly found below 20 or 25."), size = 6) + theme_void()
cowplot::plot_grid(titre, bloc, ncol = 1, rel_heights = c(0.1, 1))
model = lm(Percent_TE_Reads ~ Mean_GC + MEAN_DEPTH, data = filter(biases, MEAN_DEPTH < 25))
summary(model)
##
## Call:
## lm(formula = Percent_TE_Reads ~ Mean_GC + MEAN_DEPTH, data = filter(biases,
## MEAN_DEPTH < 25))
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.3611 -1.7650 0.6579 2.1907 10.4482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.67655 3.69856 11.268 <2e-16 ***
## Mean_GC -0.72337 0.07950 -9.099 <2e-16 ***
## MEAN_DEPTH 0.36730 0.03447 10.655 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.572 on 513 degrees of freedom
## (19 observations deleted due to missingness)
## Multiple R-squared: 0.2789, Adjusted R-squared: 0.2761
## F-statistic: 99.2 on 2 and 513 DF, p-value: < 2.2e-16
model = lm(Total_insertions ~ Mean_GC + MEAN_DEPTH, data = filter(biases, MEAN_DEPTH < 25))
round(summary(model)$coefficients, 4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.5883 74.9782 2.3552 0.0189
## Mean_GC 0.4096 1.6119 0.2541 0.7995
## MEAN_DEPTH 11.5581 0.6957 16.6134 0.0000
rsq = round(summary(model)$r.squared, 2)
temp = biases %>%
filter(MEAN_DEPTH < 25) %>%
pivot_longer(cols = c(Total_insertions, Percent_TE_Reads), names_to = "Method", values_to = "TE_estimate")
y_values = summarize(temp, max(TE_estimate))
ggplot(temp, aes(x = MEAN_DEPTH, y = TE_estimate)) +
geom_point(alpha = .5) +
theme_bw() +
geom_smooth(method = "lm") +
facet_wrap(vars(Method), scales = "free")
The non reference TIPs have some sort of support estimation. As a first step, I want to filter the ones that don’t seem reliable.
TE_insertions = bind_rows(read_tsv(paste0(TE_RIP_dir, "Non-ref_all_strains.bed"),
col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand")) %>%
mutate(ref_non_ref_TIP = "non_ref"),
read_tsv(paste0(TE_RIP_dir, "Ref_all_strains.bed"),
col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand")) %>%
mutate(ref_non_ref_TIP = "ref")) %>%
separate(col = chromosome, into = c("X1", "CHR", "X2", "X3", "X4"), sep = "_") %>%
separate(col = info, into = c("TE_family", "TSD", "Allele_Frequency", "3_support", "5_support", "ref_reads"), sep = "\\|") %>%
dplyr::select(-c(X1, X2, X3, X4, score, strand)) %>%
left_join(Zt_meta %>% dplyr::select(ID_file, Continent, Cluster, Sampling_Date, Collection)) %>%
filter(!is.na(Continent)) %>% filter(Continent != "Asia") %>%
mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"))
threshold = 0.9
subset_nb = 10000
subset = slice_sample(TE_insertions, n = subset_nb)
nb_non_filtered = nrow(TE_insertions)
nb_filtered = nrow(filter(TE_insertions, is.na(Allele_Frequency) | Allele_Frequency > threshold))
temp = nrow(filter(subset, as.numeric(Allele_Frequency) < threshold))
ggplot(subset, aes(as.numeric(Allele_Frequency))) +
geom_density() +
geom_vline(aes(xintercept = threshold), color = "#82C0CC") +
geom_label(x = 0.5, y = 100, fill = "white",
label = str_wrap(paste0((100*temp/subset_nb), "% TE with allelic frequency below ", threshold,
". \n The dataset goes from ", nb_non_filtered, " detected_TE_insertions to ",
nb_filtered, " after filtering."), 70)) +
theme_bw() +
labs(x = "Allele Frequency of the TIPs PAV for each isolate",
title = "Threshold for non-reference TIP filtering")
TE_insertions = filter(TE_insertions, is.na(Allele_Frequency) | Allele_Frequency > threshold)
Once, this is done, I want to explore some basic statistics about the TIPs. In many other studies, it seems that the TIPs SFS is biased toward lower frequency. As shown above, we have a depth bias here, so it is likely that this bias, if found in Z. tritici would be made even stronger by a non-detection artefact in low depth genomes. So in that context, here are some questions of interest: What are the frequency of the TE insertions we found? Are they shared by several samples?
TE_insertions_counts = TE_insertions %>%
unite(Continent, ID_file, col = "for_display", remove = F) %>%
dplyr::count(TE_insertion, ref_non_ref_TIP)
# SFS
cowplot::plot_grid(ggplot() + theme_void() +
geom_text(aes(x = 1, y = 1, label = "Most TE insertions are found at low frequency."),
size = 6),
ggplot(TE_insertions_counts, aes(x = n)) +
geom_density( col = "#2EC4B6", fill = "#CBF3F0") + theme_bw() +
labs(x = "TE insertion count"),
TE_insertions_counts %>%
filter(n > 50) %>%
ggplot(aes(x = n)) + geom_density(col = "#2EC4B6", fill = "#CBF3F0") + theme_bw()+
labs(x = "TE insertion count (above 11)"),
rel_heights = c(0.4, 1, 1), ncol = 1)
# Doughnut chart of frequencies
TE_insertions_counts %>%
mutate(for_bar = case_when(n == 1 ~ "01",
n == 2 ~ "02",
n <= 10 ~ "10",
n <= 100 ~ "100",
n >= 100 ~ "100 +")) %>%
ggplot(aes(x = 1, fill=for_bar)) +
geom_bar(position = "fill") +
theme_bw() +
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
labs(fill = "Number of strains",
title = "Three quarter of all TE insertions are singletons.")
data = TE_insertions_counts %>%
mutate(for_bar = case_when(n == 1 ~ "01",
n == 2 ~ "02",
n <= 10 ~ "10",
n <= 100 ~ "100",
n >= 100 ~ "100 +")) %>%
dplyr::count(for_bar)
data$fraction = data$n / sum(data$n)
data$ymax = cumsum(data$fraction) # Compute the cumulative percentages (top of each rectangle)
data$ymin = c(0, head(data$ymax, n=-1)) # Compute the bottom of each rectangle
data$labelPosition <- (data$ymax + data$ymin) / 2
data$label1 <- ifelse(round(data$fraction*100) > 2, paste0(round(data$fraction*100), "%"), "")
data$label2 <- ifelse(round(data$fraction*100) > 2, "", paste0(round(data$fraction*100), "%"))
ggplot(data, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=for_bar)) +
geom_rect() +
coord_polar(theta="y") +
xlim(c(2,4)) +
theme_void() +
geom_text( x=3.5, aes(y=labelPosition, label=label1), size=4) +
geom_text( x=4.2, aes(y=labelPosition, label=label2), size=4) +
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
labs(fill = "Number of strains",
title = "Three quarter of all TE insertions are singletons.")
temp = TE_insertions_counts %>%
separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
Length = abs(as.numeric(Start) - as.numeric(End)),
TIP_nb = n) %>%
filter(Length < 2000, ref_non_ref_TIP == "ref") %>%
mutate(for_bar = case_when(TIP_nb == 1 ~ "01",
TIP_nb == 2 ~ "02",
TIP_nb <= 10 ~ "10",
TIP_nb <= 100 ~ "100",
TIP_nb >= 100 ~ "100 +"))
ggplot(temp, aes(x = Length, y = for_bar, fill = for_bar)) +
geom_density_ridges(alpha = .9) +
theme_bw() +
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
labs(x = "TIP frequency category", y = "Length of the TIP",
title = "Length of the reference TIPs per frequency of the TIP")
Although most TIPs are found a low to very low frequency in our analysis, there are some that are identified in several samples. Among the TE insertions which are found multiple times, are the TIPs shared by isolates from different clusters or are they always from a single cluster?
theshold = 5
#Filtering only insertions found more than 5 times
insertions_per_pop = TE_insertions %>%
filter(!is.na(Cluster)) %>%
dplyr::count(Cluster, TE_insertion, ref_non_ref_TIP) %>%
filter(n > theshold) %>%
pivot_wider(names_from = Cluster, values_from = n) %>%
mutate(n = 1) %>%
dplyr::rowwise() %>%
dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))),
Nb_population = 11 - NA_count)
#Number of TIPS shared by populations
temp2 = insertions_per_pop %>%
dplyr::count(Nb_population, name = "Insertion_number") %>%
mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
label = ifelse(Nb_population == 1, paste0("N=", Insertion_number),
ifelse(Nb_population == 11, paste0("N=", Insertion_number), "")))
prop_non_spe = round(sum(temp2$Insertion_number[temp2$Insertion_type != "Population specific"])/sum(temp2$Insertion_number)*100)
ggplot(temp2, aes(x = reorder(as.character(Nb_population), Nb_population),
y = Insertion_number, fill = Insertion_type)) +
geom_bar(stat = "identity") +
geom_text(aes(y = Insertion_number + 30, label = label), size=3) +
geom_label(x = 7, y = max(temp2$Insertion_number)/2,
label = str_wrap(paste0(prop_non_spe, "% of insertions are found in two or more populations"), 20),
fill = "white") +
theme_bw() +
scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
labs(x = "Number of populations displaying an insertion",
y = "Number of TE insertions",
title = str_wrap(paste0("Two thirds of TE insertions found in more than ",
theshold, " isolates are population-specific."),55))
#Numbers of TIPS shared by population: ref vs non-ref
p1 = insertions_per_pop %>%
dplyr::count(Nb_population, name = "Insertion_number") %>%
mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
label = ifelse(Nb_population == 1, paste0("N=", Insertion_number),
ifelse(Nb_population == 11, paste0("N=", Insertion_number), ""))) %>%
ggplot(aes(x = reorder(as.character(Nb_population), Nb_population),
y = Insertion_number, fill = Insertion_type)) +
geom_bar(stat = "identity") +
geom_text(aes(y = Insertion_number + 30, label = label), size=3) +
theme_bw() +
scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
labs(x = "Number of populations displaying an insertion",
y = "Number of reference TIP")
p2 = insertions_per_pop %>%
filter(ref_non_ref_TIP != "ref") %>%
dplyr::count(Nb_population, name = "Insertion_number") %>%
mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
label = ifelse(Nb_population == 1, paste0("N=", Insertion_number),
ifelse(Nb_population == 11, paste0("N=", Insertion_number), ""))) %>%
ggplot(aes(x = reorder(as.character(Nb_population), Nb_population),
y = -Insertion_number, fill = Insertion_type)) +
geom_bar(stat = "identity") +
geom_text(aes(y = -Insertion_number - 30, label = label), size=3) +
theme_bw() +
scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
labs(x = "Number of populations displaying an insertion",
y = "Number of non-reference TIP")
cowplot::plot_grid(p2 + coord_flip() + theme(legend.position = "none",
plot.margin = unit(c(0, 0, 0, 0), "cm")),
p1 + coord_flip() + theme(legend.position = "none",
plot.margin = unit(c(0, 0, 0, 0), "cm"),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
align = "hv")
non_pop_spe = insertions_per_pop %>% filter(NA_count < 10)
nb_per_cluster = dplyr::count(Zt_meta, Cluster, name = "Nb_in_cluster")
#Number of population-specific TIPs per population
insertions_per_pop %>%
filter(Nb_population == 1) %>%
dplyr::select(-NA_count, -Nb_population) %>%
pivot_longer(cols = -c(TE_insertion, n, ref_non_ref_TIP), names_to = "Cluster", values_to = "Nb_present") %>%
filter(!is.na(Nb_present)) %>%
dplyr::count(Cluster, name = "Nb_cluster_specific") %>%
full_join(nb_per_cluster) %>%
ggplot(aes(x = Nb_in_cluster, y = Nb_cluster_specific, col = Cluster, label = Cluster)) +
geom_point() +
geom_label_repel(box.padding = 0.35, point.padding = 0.5,
segment.color = 'grey50') +
theme_bw() +
Color_Cluster +
labs(x = "Number of isolate per population", y = "Number of population-specific insertion",
title = "More TE insertions are detected in heavily sampled populations.")
So far, I’ve looked at TIPs from all TE categories together. I’m curious to see how it matches with the superfamilies and orders of TEs.
temp2 = insertions_per_pop %>%
mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
ifelse(Nb_population == 11, "Shared by all populations", "Intermediate"))) %>%
dplyr::select(TE_insertion, Insertion_type)
#TE insertions per Superfamily and frequency
full_join(TE_insertions_counts, temp2) %>%
separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
Length = abs(as.numeric(Start) - as.numeric(End)),
TIP_nb = n) %>%
dplyr::count(Order, Superfamily, Insertion_type, TIP_nb) %>%
mutate(for_bar = case_when(TIP_nb == 1 ~ "01",
TIP_nb == 2 ~ "02",
TIP_nb <= 10 ~ "10",
TIP_nb <= 100 ~ "100",
TIP_nb >= 100 ~ "100 +")) %>%
ggplot(aes(x = Superfamily, y = n, fill = for_bar)) +
geom_bar(stat= "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
facet_wrap(vars(Order), scales = "free_x")+
scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E"))
#TE insertions per Superfamily and ref/non-ref
#The insertions per pop were filtered for only TIP higher than 5, so temp2 and the following plot as well
right_join(TE_insertions_counts, temp2) %>%
separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
Length = abs(as.numeric(Start) - as.numeric(End))) %>%
dplyr::count(Order, Superfamily, Insertion_type, ref_non_ref_TIP) %>%
ggplot(aes(x = Superfamily, y = n, fill = Insertion_type)) +
geom_bar(stat= "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
facet_wrap(vars(Order, ref_non_ref_TIP), scales = "free")+
scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6"))
Let’s compare the TE content per population and per continent. First, I will make the comparison using the read mapping method.
### Plot
#Building the basic violin plot per continent
#Continents
model = lm(Percent_TE_Reads ~ Continent + Collection,
data=TE_qty %>% filter(!is.na(Continent), !is.na(Cluster)))
CLD = cld(lsmeans(model, ~ Continent), alpha = 0.05, Letters = letters, adjust = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
TE_prop = ggplot(TE_qty %>% filter(!is.na(Continent), !is.na(Cluster))) +
geom_hline(aes(yintercept = world_avg),
color = "gray30", size = 0.6, linetype = "dashed") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Percentage of reads", width = 30),
subtitle = str_wrap(paste(""), width = 70)) +
geom_text(data = CLD, aes(x = Continent, label = .group, y = 34), color = "black")+
geom_violin(aes(x = Continent, y = Percent_TE_Reads, fill = Continent), alpha = .8) +
Fill_Continent + Color_Continent +
stat_summary(aes(x = Continent, y = Percent_TE_Reads), fun = mean, geom = "point", size = 2, color = "grey30") +
labs(title = "Amount of reads mapping on TE consensus per continent")
TE_prop
#Clusters
model = lm(Percent_TE_Reads ~ Cluster + Collection, data=TE_qty)
CLD = cld(lsmeans(model, ~ Cluster), alpha = 0.05, Letters = letters, adjust = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
TE_qty %>%
filter(!is.na(Continent), !is.na(Cluster)) %>%
ggplot() +
geom_hline(aes(yintercept = world_avg),
color = "gray30", size = 0.6, linetype = "dashed") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Percentage of reads", width = 30),
subtitle = str_wrap(paste(""), width = 70)) +
geom_text(data = CLD, aes(x = Cluster, label = .group, y = 34), color = "black")+
geom_violin(aes(x = Cluster, y = Percent_TE_Reads, fill = Cluster), alpha = .8) +
stat_summary(aes(x = Cluster, y = Percent_TE_Reads), fun = mean, geom = "point", size = 2, color = "grey30") +
labs(title = "Amount of reads mapping on TE consensus per cluster") +
Fill_Cluster
The statistics used here are a one-way ANOVA with block. Blocks are used in an analysis of variance or similar models in order to account for suspected variation from factors other than the treatments or main independent variables being investigated. Here I considered the collection as the confounding factor. It definitely has an effect and was thus accounted for in the statistics related to TE content and to RIP level.
Genomes from isolates in Oceania and the Americas seem to contain more TE than those from the Middle-East, in particular.
Let’s see if the same result is obtained with the TE insertions as detected by ngs_te_mapper2.
TE_insertion_wrld_average = TE_qty %>%
dplyr::summarize(avg = mean(as.numeric(Total_insertions), na.rm = T)) %>%
pull(avg)
#Continents
model = lm(Total_insertions ~ Continent + Collection, data=TE_qty)
CLD = cld(lsmeans(model, ~ Continent), alpha = 0.05, Letters = letters, adjust = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
TE_qty %>%
filter(!is.na(Continent)) %>%
ggplot(aes(x = Continent, y = Total_insertions, fill = Continent)) +
geom_violin(alpha = .8) +
geom_text(data = CLD, aes(x = Continent, label = .group, y = 750), color = "black") +
Fill_Continent + Color_Continent +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
title = "TE insertions per continent",
subtitle = str_wrap(paste(""), width = 70)) +
stat_summary(aes(x = Continent, y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")
#Cluster
model = lm(Total_insertions ~ Cluster, data=TE_qty)
CLD = cld(lsmeans(model, ~ Cluster), alpha = 0.05, Letters = letters, adjust = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
TE_qty %>%
filter(!is.na(Cluster)) %>%
ggplot(aes(x = Cluster, y = Total_insertions, fill = Cluster)) +
geom_violin(alpha = .8) +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
title = "TE insertions per cluster",
subtitle = str_wrap(paste(""), width = 70)) +
geom_text(data = CLD, aes(x = Cluster, label = .group, y = 750), color = "black") +
Fill_Cluster +
stat_summary(aes(x = Cluster, y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")
The pattern is somewhat different with the TE insertions. In this case, the Oceania samples don’t look particularly high. However, the pattern with the African and Middle-Eastern samples being lower is even clearer.
ngs_te_mapper2 distinguishes between reference and non-reference insertions. I am curious to see how they differ, so I create the same plot but with the two values separately.
p1 = TE_qty %>%
filter(!is.na(Continent) & Continent != "Asia") %>%
ggplot(aes(x = Continent, y = Non_ref_insertions, fill = Continent)) +
geom_violin(alpha = .8) +
Fill_Continent + Color_Continent +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Non-reference insertions", width = 30))
p2 = TE_qty %>%
filter(!is.na(Continent) & Continent != "Asia") %>%
ggplot(aes(x = Continent, y = Ref_insertions, fill = Continent)) +
geom_violin(alpha = .8) +
Fill_Continent + Color_Continent +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Reference insertions", width = 30))
cowplot::plot_grid(p1 + coord_flip() + theme(legend.position = "none"),
p2 + coord_flip(), rel_widths = c(1, 1.9))
Aside from the repeating pattern of low TE content in the Middle-East/Africa, the most striking difference is the European samples containing more reference insertions. Considering, that the reference genome is a sample isolated from Denmark, it is expected that other European samples would be closer.
I know for sure that my estimates are biased, either because of depth, GC content, or both. So I want to make some more checks to ensure that I am confident in the results I show. First, I want to try and compare the clusters intra-collection to ensure that the results observed previously can be confirmed within each collection.
temp = TE_qty %>%
filter(Cluster != "NA") %>%
dplyr::count(Collection, Cluster) %>%
mutate(n = ifelse(n < 5, NA, n)) %>%
pivot_wider(names_from = Cluster, values_from = n)%>%
rowwise() %>%
dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))))
temp
## # A tibble: 12 x 13
## # Rowwise:
## Collection V10 V11 V2 V5 V6 V7 V8 V9 V4 V1 V3
## <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 ETHZ_2020 24 14 NA 22 14 7 8 12 NA NA NA
## 2 Hartmann_F… 50 NA 22 NA 25 NA NA NA 27 NA NA
## 3 Hartmann_O… 94 NA NA NA NA NA NA NA NA NA NA
## 4 JGI 6 NA 13 NA NA NA 10 NA NA NA NA
## 5 JGI_2 NA NA 11 NA NA NA NA NA NA NA NA
## 6 JGI_3 NA NA 8 NA NA 6 NA NA NA NA NA
## 7 JGI_4 NA NA NA NA NA NA NA NA NA NA NA
## 8 JGI_Thierry NA 7 11 NA NA NA NA NA NA NA NA
## 9 MMcDonald_… NA NA NA NA NA NA NA NA NA 16 31
## 10 Syngenta NA NA 267 NA NA NA NA NA NA NA NA
## 11 Third_batc… NA NA 8 NA NA NA NA NA NA NA NA
## 12 <NA> NA NA NA NA NA NA NA NA NA NA NA
## # … with 1 more variable: NA_count <int>
collections_multiple = temp %>%
filter(NA_count <= 9) %>%
pivot_longer(cols = -c(Collection, NA_count), names_to = "Cluster", values_to = "n") %>%
filter(!is.na(n))
inner_join(TE_qty, dplyr::select(collections_multiple, Collection, Cluster)) %>%
ggplot() +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Percentage of reads", width = 30),
subtitle = str_wrap(paste(""), width = 70)) +
geom_boxplot(aes(x = Cluster, y = Percent_TE_Reads, fill = Cluster), alpha = .8) +
Fill_Cluster + Color_Cluster +
labs(title = "Amount of reads mapping on TE consensus per collection per cluster") +
facet_wrap(vars(Collection), scales = "free")
inner_join(TE_qty, dplyr::select(collections_multiple, Collection, Cluster)) %>%
ggplot() +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Percentage of reads", width = 30),
subtitle = str_wrap(paste(""), width = 70)) +
geom_boxplot(aes(x = Cluster, y = Total_insertions, fill = Cluster), alpha = .8) +
Fill_Cluster + Color_Cluster +
labs(title = "TE insertions per collection per cluster") +
facet_wrap(vars(Collection), scales = "free")
Second, I check whether a plot and statistics including the depth of coverage still recovers the difference between groups that I have observed above.
threshold = 10 #Number of minimum presence count
#
temp = inner_join(filter(TE_insertions_counts, n > 10), TE_insertions) %>%
inner_join(dplyr::select(non_pop_spe, TE_insertion)) %>%
group_by(ID_file, Continent) %>%
dplyr::count() %>%
inner_join(depth, by = c("ID_file" = "INDV"))
ggplot(temp, aes(x = MEAN_DEPTH, y = n, col = Continent)) +
geom_point(alpha = .5) +
theme_bw() +
labs(title = "Depth bias check after TIP comparison between continent",
subtitle = "Only non-population specific",
x = "Mean depth of coverage",
y = paste0("Number of TIP with more than ", threshold, " presence")) +
Color_Continent
ggplot(temp, aes(x = MEAN_DEPTH, y = n, col = Continent)) +
geom_point(alpha = .5) +
theme_bw() +
labs(title = "Depth bias check after TIP comparison between continent",
subtitle = "Only non-population specific",
x = "Mean depth of coverage",
y = paste0("Number of TIP with more than ", threshold, " presence")) +
Color_Continent +
geom_smooth(method = "glm", formula = y~log(x), se = F,
method.args = list(family = gaussian(link = 'log')))
#
inner_join(filter(TE_insertions_counts, n > threshold), TE_insertions) %>%
group_by(ID_file, Continent, Cluster) %>%
dplyr::count() %>%
inner_join(depth, by = c("ID_file" = "INDV")) %>%
ggplot(aes(x = MEAN_DEPTH, y = n, col = Cluster, shape = Cluster)) +
geom_point(alpha = .5) +
theme_bw() +
labs(title = "Depth bias check TIP comparison between continent",
subtitle = "All TIPs (population specific and others)",
x = "Mean depth of coverage",
y = paste0("Number of TIP with more than ", threshold, " presence")) +
scale_shape_manual(values = c(0,1,2,3,4,5,6,7,8,9,10)) +
Color_Cluster #+
# geom_smooth(method = "lm", se = F)
inner_join(filter(TE_insertions_counts, n > threshold), TE_insertions) %>%
group_by(ID_file, Continent, Cluster) %>%
dplyr::count() %>%
inner_join(depth, by = c("ID_file" = "INDV")) %>%
ggplot(aes(x = MEAN_DEPTH, y = n, col = Cluster, shape = Cluster)) +
geom_point(alpha = .5) +
theme_bw() +
labs(title = "Depth bias check TIP comparison between continent",
subtitle = "All TIPs (population specific and others)",
x = "Mean depth of coverage",
y = paste0("Number of TIP with more than ", threshold, " presence")) +
scale_shape_manual(values = c(0,1,2,3,4,5,6,7,8,9,10)) +
Color_Cluster
model = lm(n ~ Continent + MEAN_DEPTH, data = temp)
summary(model)
##
## Call:
## lm(formula = n ~ Continent + MEAN_DEPTH, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -218.540 -25.645 7.865 26.356 76.783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.11554 6.02568 18.274 < 2e-16 ***
## ContinentEurope 43.05024 6.53690 6.586 7.97e-11 ***
## ContinentMiddle East -8.88905 7.08540 -1.255 0.21
## ContinentNorth America 59.96574 6.34281 9.454 < 2e-16 ***
## ContinentOceania 62.99936 6.87559 9.163 < 2e-16 ***
## ContinentSouth America 63.78178 7.71384 8.268 5.28e-16 ***
## MEAN_DEPTH 2.58642 0.08561 30.213 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.56 on 840 degrees of freedom
## Multiple R-squared: 0.6978, Adjusted R-squared: 0.6957
## F-statistic: 323.3 on 6 and 840 DF, p-value: < 2.2e-16
We can see that different clusters and continent have a different overall TE content. However, I am curious to see if there is a geographical clustering intrinsic to the TE content that can be uncovered without an a priori sorting of samples.
First, I will use the ngs_te_mapper2 results, and use each insertion identified in more than 5 samples to create different clustering. First, I’ll simply create a heatmap, then I’ll move on to PCAs, first by cluster, then with each isolate.
matrice = TE_insertions %>%
#filter(!is.na(Cluster)) %>%
inner_join(dplyr::select(non_pop_spe, TE_insertion)) %>%
inner_join(filter(TE_insertions_counts, n > 5))%>%
dplyr::count(TE_insertion, Cluster) %>%
left_join(nb_per_cluster) %>%
mutate(Freq = n/Nb_in_cluster) %>%
dplyr::select(Cluster, TE_insertion, Freq) %>%
pivot_wider(names_from = TE_insertion, values_from = Freq, values_fill = 0)
#Heatmap per genetic cluster
temp = as.matrix(matrice[,c(3:ncol(matrice))])[, which(apply(as.matrix(matrice[,c(3:ncol(matrice))]), 2, var) != 0)]
rownames(temp) = matrice$Cluster
pheatmap(temp, show_colnames = F)
#PCA per cluster
TE.pca = prcomp(temp, center = TRUE,scale. = TRUE)
temp = as.tibble(cbind(matrice, as.data.frame(TE.pca$x))) %>%
dplyr::select(Cluster, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8)
p1 = ggplot(temp, aes(x = PC1, y = PC2, label = Cluster, col = Cluster)) +
geom_text() +
Color_Cluster +
theme_bw() + theme(legend.position = "none")
p2 = ggplot(temp, aes(x = PC3, y = PC4, label = Cluster, col = Cluster)) +
geom_text() +
Color_Cluster +
theme_bw()+ theme(legend.position = "none")
p3 = ggplot(temp, aes(x = PC5, y = PC6, label = Cluster, col = Cluster)) +
geom_text() +
Color_Cluster +
theme_bw()+ theme(legend.position = "none")
p4 = ggplot(temp, aes(x = PC7, y = PC8, label = Cluster, col = Cluster)) +
geom_text() +
Color_Cluster +
theme_bw()+ theme(legend.position = "none")
cowplot::plot_grid(p1, p2, p3, p4)
#PCA per isolate
matrice = TE_insertions %>%
filter(!is.na(Cluster)) %>%
inner_join(filter(TE_insertions_counts, n > 10)) %>%
dplyr::count(TE_insertion, ID_file, Cluster) %>%
group_by(ID_file) %>%
mutate(Count_strain = sum(n)) %>%
pivot_wider(names_from = TE_insertion, values_from = n, values_fill = 0)
temp = as.matrix(matrice[,c(3:ncol(matrice))])[, which(apply(as.matrix(matrice[,c(3:ncol(matrice))]), 2, var) != 0)]
TE.pca = prcomp(temp, center = TRUE, scale. = TRUE)
#And now all against all but not interactive
temp = as.tibble(cbind(matrice, as.data.frame(TE.pca$x))) %>%
dplyr::select(ID_file, Cluster, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8)
p1 = ggplot(temp, aes(x = PC1, y = PC2, col = Cluster)) +
geom_point() +
Color_Cluster +
theme_bw() + theme(legend.position = "none")
p2 = ggplot(temp, aes(x = PC3, y = PC4, col = Cluster)) +
geom_point() +
Color_Cluster +
theme_bw() + theme(legend.position = "none")
p3 = ggplot(temp, aes(x = PC5, y = PC6, col = Cluster)) +
geom_point() +
Color_Cluster +
theme_bw() + theme(legend.position = "none")
p4 = ggplot(temp, aes(x = PC7, y = PC8, col = Cluster)) +
geom_point() +
Color_Cluster +
theme_bw() + theme(legend.position = "none")
cowplot::plot_grid(p1, p2, p3, p4)
It does look like there is clustering of samples according to geography. This is interesting as it shows that the types of TEs are different in different populations.
temp = TE_insertions %>%
filter(Allele_Frequency >= threshold | is.na(Allele_Frequency)) %>%
dplyr::count(Continent, Order, Superfamily) %>%
group_by(Continent, Order) %>%
mutate(Count_Order = sum(n)) %>%
group_by(Continent) %>%
mutate(Count_Continent = sum(n)) %>%
ungroup() %>%
mutate(Prop_superfamily = n/Count_Continent,
Prop_Order = n/Count_Continent)
ggplot(temp, aes(x = Continent, y = Prop_superfamily, fill = Superfamily)) +
geom_bar(stat = "identity") +
theme_bw() +
facet_grid(rows = vars(Order))
temp = TE_insertions %>%
filter(Allele_Frequency >= threshold | is.na(Allele_Frequency)) %>%
dplyr::count(Continent, ref_non_ref_TIP) %>%
group_by(Continent, ref_non_ref_TIP) %>%
mutate(Count_type = sum(n)) %>%
group_by(Continent) %>%
mutate(Count_Continent = sum(n)) %>%
ungroup() %>%
mutate(Prop_type = n/Count_Continent)
ggplot(temp, aes(x = Continent, y = Prop_type, fill = ref_non_ref_TIP)) +
geom_bar(stat = "identity") +
theme_bw()
Let’s do the same with the mapping TE content method! In this case, I don’t know which reads belong to which exact copy of a TE, but I can look at the reads aligning on each consensus sequence. I will thus make a PCA based on the proportion of reads mapping on each TE.
reads_per_TE = read_delim(paste0(RIP_DIR, "Nb_reads_per_TE.txt"), delim = "\t",
col_names = c("ID_file", "TE", "Length",
"# mapped read-segments", "# unmapped read-segments")) %>%
filter(TE != "*") %>%
separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
dplyr::mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
left_join(Zt_meta %>% dplyr::select(ID_file, Collection, Country, Continent)) %>%
unite(Continent, Country, ID_file, col = "for_display", remove = F)
temp = reads_per_TE %>% group_by(ID_file) %>%
dplyr::summarise(Reads_mapped_per_TE = sum(`# mapped read-segments`))
reads_per_TE = left_join(reads_per_TE, temp) %>%
dplyr::mutate(Normalized_nb_reads_mapped = `# mapped read-segments` / Reads_mapped_per_TE)
TE_PCA_mat = reads_per_TE %>%
dplyr::select(ID_file, Continent, Collection, for_display, TE, Normalized_nb_reads_mapped) %>%
spread(key = TE, value = as.numeric(Normalized_nb_reads_mapped))
temp = as.matrix(TE_PCA_mat[,c(5:ncol(TE_PCA_mat))])[, which(apply(as.matrix(TE_PCA_mat[,c(5:ncol(TE_PCA_mat))]), 2, var) != 0)]
TE.pca = prcomp(temp, center = TRUE,scale. = TRUE)
temp = as.tibble(cbind(TE_PCA_mat, as.data.frame(TE.pca$x))) %>%
dplyr::select(for_display, Continent, PC1, PC2, PC3, PC4)
p = ggpairs(temp, columns = c(3:6), ggplot2::aes(col=Continent, fill = Continent, alpha = 0.6),
title = "PCA based on normalized reads mapping on each TE consensus",
upper = list(continuous = "points", combo = "box_no_facet"))
for(i in 1:p$nrow) {
for(j in 1:p$ncol){
p[i,j] <- p[i,j] + theme_bw() + Color_Continent +Fill_Continent
}
}
p
#And now all against all but not interactive
temp = as.tibble(cbind(TE_PCA_mat, as.data.frame(TE.pca$x))) %>%
dplyr::select(for_display, Collection, PC1, PC2, PC3, PC4)
p = ggpairs(temp, columns = c(3:6), ggplot2::aes(col=Collection, fill = Collection, alpha = 0.6),
title = "PCA based on normalized reads mapping on each TE consensus",
upper = list(continuous = "points", combo = "box_no_facet"))
for(i in 1:p$nrow) {
for(j in 1:p$ncol){
p[i,j] <- p[i,j] + theme_bw()
}
}
p
I want to gain some insights about where, in the genome, are these TIPs we have detected: are they on specific chromosomes? Is there a difference between core and accessory chromosomes?
I’ll consider the number of TIP and normalize it by length to take into account the widely different chromosome size.
chromosomes_lengths = readxl::read_excel(paste0(metadata_dir, "Chromosomes_cumulative_lengths.xlsx"))
p1 = TE_insertions %>%
dplyr::count(Order, CHR, ID_file) %>%
inner_join(dplyr::select(chromosomes_lengths, CHR, Length) %>% mutate(CHR = as.character(CHR))) %>%
ggplot(aes(x = reorder(as.character(CHR), as.numeric(CHR)), y = n/(Length/1000000))) +
geom_boxplot() +
theme_bw() +
theme(axis.title = element_text(size = 10))+
facet_wrap(vars(Order), scales = "free") +
labs(x = "", y = "Per-sample insertions per Mb")
p2 = TE_insertions %>%
dplyr::count(CHR, ref_non_ref_TIP, Order) %>%
inner_join(dplyr::select(chromosomes_lengths, CHR, Length) %>% mutate(CHR = as.character(CHR))) %>%
ggplot(aes(x = reorder(as.character(CHR), as.numeric(CHR)), y = n/(Length/1000000), fill = ref_non_ref_TIP)) +
geom_bar(stat = "identity") +
theme_bw() +
facet_wrap(vars(Order), scales = "free") +
scale_fill_manual(values = c("#FFD399", "#cbf3f0")) +
theme(legend.position = "bottom", axis.title = element_text(size = 10)) +
labs(x = "", y = "Number insertions per Mb", fill = "Insertion type")
cowplot::plot_grid(p1, p2, ncol = 1, rel_heights = c(1, 1.2), align = "hv")
It looks like the accessory chromosomes might be a bit higher than the core chromosome. I also want to get finer in the scale: are there places in the genome which tend to have more TIPs than others?
TE_insertions_per_window = TE_insertions %>%
mutate(CHR = as.numeric(CHR),
window = trunc(((position + end)/2)/10000)) %>%
dplyr::select(TE_insertion, CHR, window) %>%
distinct() %>%
group_by(CHR, window) %>%
dplyr::count()
summar = ungroup(TE_insertions_per_window) %>%
summarise(median_TE = median(n),
sd_TE = sd(n))
TE_insertions_per_window = TE_insertions_per_window %>%
#inner_join(summar) %>%
mutate(outlier = ifelse(n >= summar$median_TE + 3*summar$sd_TE, "outlier", "non_outlier"))
TE_insertions_per_window %>%
ungroup() %>%
mutate(Start = window*10000, End = Start + 10000) %>%
dplyr::select(-window, TIP_count = n, TIP_category = outlier) %>%
write_tsv(file = paste0(TE_RIP_dir, "Ztritici_global_March2021.good_samples.10kb_windows.TIP_counts.tab"))
filter(TE_insertions_per_window, as.numeric(CHR) < 8) %>%
ggplot(aes(x = window, y = n, col = as.character(outlier))) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("#f5cac3", "#f28482")) +
geom_point(alpha = .8) +
geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
linetype="dashed", col = "grey80") +
facet_grid(rows = vars(CHR), scales = "free_x")
filter(TE_insertions_per_window, as.numeric(CHR) >= 8 & as.numeric(CHR) < 14) %>%
ggplot(aes(x = window, y = n, col = outlier)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("#f5cac3", "#f28482")) +
geom_point(alpha = .8) +
geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
linetype="dashed", col = "grey80") +
facet_grid(rows = vars(CHR), scales = "free_x")
filter(TE_insertions_per_window, as.numeric(CHR) >= 14) %>%
ggplot(aes(x = window, y = n, col = outlier)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("#f5cac3", "#f28482")) +
geom_point(alpha = .8) +
geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
linetype="dashed", col = "grey80") +
facet_grid(rows = vars(CHR), scales = "free_x")
label1 = ungroup(TE_insertions_per_window) %>%
filter(!is.na(CHR)) %>%
dplyr::summarize(Average = mean(n)) %>% pull()
p1 = ungroup(TE_insertions_per_window) %>%
filter(!is.na(CHR)) %>%
group_by(CHR) %>%
dplyr::summarize(Average = mean(n)) %>%
dplyr::mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
ggplot(aes(x = reorder(as.character(CHR), CHR), y = Average, fill = Chromosome_type)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = label1, linetype = "dashed", col = "grey40") +
theme_bw() +
scale_fill_manual(values = c("#2ec4b6","#cbf3f0")) +
labs(title = paste0("GW TIP number average is ", round(label1, 2)),
x = "Chromosome", y = "Mean # TIPs per 10kb") +
theme(legend.position = "none")
p2 = ungroup(TE_insertions_per_window) %>%
dplyr::count(CHR, outlier) %>%
group_by(CHR) %>%
dplyr::mutate(Tot_window = sum(n)) %>%
mutate(prop = 100*n/Tot_window) %>%
mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
filter(outlier == "outlier") %>%
ggplot(aes(x = reorder(as.character(CHR), CHR), y = prop, fill = Chromosome_type)) +
geom_bar(stat = "identity") +
theme_bw() +
scale_fill_manual(values = c("#2ec4b6","#cbf3f0")) +
labs(title = "Proportion of outlier windows",
x = "Chromosome", y = "% outlier windows") +
theme(legend.position = "none") +
geom_text(aes(y = 22, label = paste0(round(prop, 2), "%")), size = 3)
cowplot::plot_grid(p1 + coord_flip(), p2 + coord_flip())
ungroup(TE_insertions_per_window) %>%
dplyr::count(CHR, outlier) %>%
group_by(CHR) %>%
dplyr::mutate(Tot_window = sum(n)) %>%
mutate(prop = 100*n/Tot_window) %>%
filter(outlier == "outlier")
## # A tibble: 19 x 5
## # Groups: CHR [19]
## CHR outlier n Tot_window prop
## <dbl> <chr> <int> <int> <dbl>
## 1 1 outlier 4 592 0.676
## 2 2 outlier 3 369 0.813
## 3 3 outlier 7 330 2.12
## 4 4 outlier 2 271 0.738
## 5 5 outlier 7 269 2.60
## 6 6 outlier 6 249 2.41
## 7 7 outlier 4 253 1.58
## 8 8 outlier 4 226 1.77
## 9 9 outlier 7 206 3.40
## 10 10 outlier 5 161 3.11
## 11 11 outlier 3 158 1.90
## 12 13 outlier 4 115 3.48
## 13 14 outlier 3 70 4.29
## 14 15 outlier 5 62 8.06
## 15 17 outlier 5 52 9.62
## 16 18 outlier 1 51 1.96
## 17 19 outlier 4 52 7.69
## 18 20 outlier 1 45 2.22
## 19 21 outlier 3 37 8.11
ungroup(TE_insertions_per_window) %>% dplyr::count(outlier)
## # A tibble: 2 x 2
## outlier n
## <chr> <int>
## 1 non_outlier 3691
## 2 outlier 78
There is one window that is much higher than anything else on chromosome 12. This is strange, so I started looking more specifically at this partilar locus. Is the increase in TIPs due to only one type of TE? Is it even more localized than the 10kb-window I’ve looked at?
temp = TE_insertions %>% filter(CHR == 12) %>%
mutate(window = trunc(((position + end)/2)/10000)) %>% filter(window == 52)
p1 = temp %>%
dplyr::select(Superfamily, TE_family, TE_insertion ) %>%
distinct() %>%
ggplot(aes(x = Superfamily, fill = TE_family)) +
geom_bar() +
scale_fill_manual(values = rep(c("#f6bd60", "#f6ca83", "#f7ede2", "#f5cac3", "#84a59d", "#73683b", "#d0d38f", "#cd5b72", "#f28482"), 10))
p1 + theme(legend.position = "none")
cowplot::plot_grid(get_legend(p1))
temp %>%
mutate(pos = trunc((position + end)/200)) %>%
group_by(TE_insertion, TE_family, Superfamily, ref_non_ref_TIP, pos) %>%
ggplot(aes(x = pos*100, fill = Continent)) +
geom_bar() +
theme_bw() +
facet_grid(rows = vars(Superfamily), scales = "free_y") +
Fill_Continent
bind_rows(read_delim(paste0(TE_RIP_dir, "Non-ref_all_strains.bed"),
col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand"), delim = "\t") %>%
mutate(ref_non_ref_TIP = "non_ref"),
read_delim(paste0(TE_RIP_dir, "Ref_all_strains.bed"),
col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand"), delim = "\t") %>%
mutate(ref_non_ref_TIP = "ref")) %>%
separate(col = chromosome, into = c("X1", "CHR", "X2", "X3", "X4"), sep = "_") %>%
separate(col = info, into = c("TE_family", "TSD", "Allele_Frequency", "3_support", "5_support", "ref_reads"), sep = "\\|") %>%
dplyr::select(-c(X1, X2, X3, X4, score, strand)) %>%
dplyr::select(CHR, position, end, TE_family) %>%
distinct() %>%
write_tsv(paste0(TE_RIP_dir, "Insertions.real.bed"), col_names = F)
gff = read_delim(gff_file, col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot"),
delim = "\t", comment = "#") %>%
filter(mRNA == "mRNA") %>%
separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
mutate(Name = str_remove(Name, "Name="))
~/Documents/Software/bedtools2/bin/bedtools sort -i ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3 | grep "mRNA" > ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.mRNA.gff3
~/Documents/Software/bedtools2/bin/bedtools sort -i /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions.real.bed > /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions.sorted.bed
~/Documents/Software/bedtools2/bin/bedtools closest -a ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions.sorted.bed -b ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.mRNA.gff3 -d > ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions_closest_genes.tab
distance_TE_genes = read_delim(paste0(TE_RIP_dir, "Insertions_closest_genes.tab"),
col_names = c("CHR", "position", "end", "TE_family", "X5", "X1", "mRNA",
"Start", "Stop", "X2", "X3", "X4", "Annot", "Distance"), delim = "\t") %>%
dplyr::select(-c(X1, X2, X3, X4, X5, mRNA)) %>%
mutate(CHR = as.character(CHR)) %>%
mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F)
temp = distance_TE_genes %>%
full_join(TE_insertions)
TE_insertions %>%
unite(Continent, ID_file, col = "for_display", remove = F) %>%
dplyr::count(TE_insertion)
## # A tibble: 80,511 x 2
## TE_insertion n
## <chr> <int>
## 1 DHH_Ada_consensus-2:1:1002400:1002400 1
## 2 DHH_Ada_consensus-2:1:1002500:1002500 1
## 3 DHH_Ada_consensus-2:1:1043800:1043800 1
## 4 DHH_Ada_consensus-2:1:1062900:1062900 2
## 5 DHH_Ada_consensus-2:1:1073200:1073200 1
## 6 DHH_Ada_consensus-2:1:1102700:1102700 2
## 7 DHH_Ada_consensus-2:1:1102800:1102800 1
## 8 DHH_Ada_consensus-2:1:112100:112100 1
## 9 DHH_Ada_consensus-2:1:1122500:1122500 6
## 10 DHH_Ada_consensus-2:1:1128600:1128600 1
## # … with 80,501 more rows
#Categories based on allelic frequencies
p1 = TE_insertions_counts %>%
mutate(for_bar = case_when(n == 1 ~ "01",
n == 2 ~ "02",
n <= 10 ~ "10",
n <= 100 ~ "100",
n >= 100 ~ "100 +")) %>%
inner_join(distance_TE_genes) %>%
ggplot(aes(x = for_bar, y = log(Distance))) +
geom_violin() +
geom_boxplot(width = .2) +
theme_bw()
#Categories based on population specificity
temp = insertions_per_pop %>%
filter(Nb_population == 1) %>%
dplyr::select(-c(NA_count, Nb_population, n)) %>%
pivot_longer(cols = -c(TE_insertion, ref_non_ref_TIP), names_to = "Specificity_category", values_to = "Nb") %>%
filter(!is.na(Nb)) %>%
dplyr::select(-Nb)
p2 = insertions_per_pop %>%
filter(Nb_population > 1) %>%
dplyr::select(TE_insertion) %>%
mutate(Specificity_category = "Non_specific") %>%
bind_rows(temp) %>%
inner_join(distance_TE_genes) %>%
ggplot(aes(x = Specificity_category, y = log(Distance))) +
geom_violin() +
geom_boxplot(width = .2) +
theme_bw()
cowplot::plot_grid(p1, p2)
#
temp = inner_join(read_delim(paste0(TE_RIP_dir, "Insertions_closest_genes.downstream.tab"),
col_names = c("CHR", "position", "end", "TE_family", "X5", "X1", "mRNA",
"Start", "Stop", "X2", "X3", "X4", "Annot", "Distance"), delim = "\t") %>%
mutate(CHR = as.character(CHR)) %>%
mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
dplyr::select(TE_insertion, Distance_downstream = Distance),
read_delim(paste0(TE_RIP_dir, "Insertions_closest_genes.upstream.tab"),
col_names = c("CHR", "position", "end", "TE_family", "X5", "X1", "mRNA",
"Start", "Stop", "X2", "X3rm", "X4", "Annot", "Distance"), delim = "\t") %>%
mutate(CHR = as.character(CHR)) %>%
mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
dplyr::select(TE_insertion, Distance_upstream = Distance))
ggplot(temp, aes(x = abs(Distance_downstream), y = abs(Distance_upstream))) +
geom_hex() +
theme_bw() +
scale_x_continuous(trans = "log10")+
scale_y_continuous(trans = "log10")
temp = inner_join(temp, TE_insertions_counts)
p1 = temp %>% filter(ref_non_ref_TIP == "ref") %>%
ggplot(aes(x = abs(Distance_downstream), y = abs(Distance_upstream))) +
geom_hex() +
theme_bw() +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
labs(title = "Distance to gene of reference TIP") +
theme(legend.position = "bottom")
p2 = temp %>% filter(ref_non_ref_TIP != "ref") %>%
ggplot(aes(x = abs(Distance_downstream), y = abs(Distance_upstream))) +
geom_hex() +
theme_bw() +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
labs(title = "Distance to gene of non-reference TIP")+
theme(legend.position = "bottom")
plot_grid(p1, p2)
I now look at the repeat-induced point mutations in reads that map on the different TE consensus. We expect to see differences in the different geographical groups so I start by visualizing this.
#while read p; do fichier_list=$(ls -1 /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/${p}\.*txt) ; for fichier in $fichier_list; do short_name=$(echo $fichier | cut -f 8 -d "/" | cut -f 2 -d "." ) ; comp=$(grep Composite $fichier) ; echo $p $short_name $comp ; done; done < Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args > /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
RIP=read_delim(paste0(RIP_DIR, "Composite_index.txt"),
col_names = c("ID_file", "TE", "Variable", "Composite_median", "Composite_mean" ),
delim = " ", na = c("nan", "NA", "")) %>%
separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)"))%>%
left_join(Zt_meta %>% dplyr::select(Continent, Country, Collection, ID_file)) %>%
unite(Continent, Country, ID_file, col = "for_display", remove = F) %>%
left_join(., reads_per_TE)
#DONE: merge with reads_per_TE to be able to filter the points that have too few reads mapped!
#Per continent
world_RIP_avg <-
RIP %>%
filter(TE == "RIP_est") %>%
dplyr::summarize(avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
pull(avg)
temp = RIP %>%
filter(TE == "RIP_est") %>%
filter(!is.na(Continent)) %>%
group_by(Continent) %>%
dplyr::mutate(region_avg = mean(as.numeric(Composite_median), na.rm = T))
RIP_plot = ggplot(temp, aes(x = Continent,
y = as.numeric(Composite_median),
color = Continent)) +
coord_flip() +
geom_segment(aes(x = Continent, xend = Continent,
y = world_RIP_avg, yend = region_avg), size = 0.8) +
geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
geom_hline(aes(yintercept = world_RIP_avg), color = "gray70", size = 0.6) +
stat_summary(fun = mean, geom = "point", size = 5) +
Color_Continent +
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = "RIP composite index",
title = "RIP levels per continents",
subtitle = str_wrap(paste("The RIP levels in reads mapping on TE consensus",
"are high in the Middle-East",
"and low in North America in particular."), width = 70))
#Statistical test
#One-way ANOVA with blocks
##Define linear model
model = lm(as.numeric(Composite_median) ~ Continent + Collection ,
data=temp)
summary(model) ### Will show overall p-value and r-squared
##
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent + Collection,
## data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45164 -0.04214 0.00525 0.04319 0.50785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.76372 0.03086 57.144 < 2e-16 ***
## ContinentAsia -0.13501 0.10119 -1.334 0.1824
## ContinentEurope -0.22017 0.02011 -10.948 < 2e-16 ***
## ContinentMiddle East 0.13058 0.02027 6.443 1.77e-10 ***
## ContinentNorth America -0.44765 0.01948 -22.974 < 2e-16 ***
## ContinentOceania -0.40964 0.02634 -15.553 < 2e-16 ***
## ContinentSouth America -0.31207 0.02138 -14.598 < 2e-16 ***
## CollectionEschikon_2017 -0.15579 0.02544 -6.123 1.29e-09 ***
## CollectionETHZ_2020 -0.04931 0.02794 -1.765 0.0779 .
## CollectionFillinger -0.15375 0.07216 -2.131 0.0334 *
## CollectionGWASpanel_BIOGER -0.13263 0.02553 -5.195 2.45e-07 ***
## CollectionHartmann_FstQst_2015 -0.36444 0.02693 -13.532 < 2e-16 ***
## CollectionHartmann_Oregon_2016 -0.16815 0.02937 -5.725 1.34e-08 ***
## CollectionJGI 0.02059 0.03034 0.678 0.4976
## CollectionJGI_2 -0.01836 0.03006 -0.611 0.5414
## CollectionJGI_3 -0.16092 0.03328 -4.835 1.53e-06 ***
## CollectionJGI_4 -0.10925 0.09933 -1.100 0.2717
## CollectionJGI_Thierry -0.06220 0.02896 -2.148 0.0319 *
## CollectionMegan_McDonald -0.07177 0.04165 -1.723 0.0851 .
## CollectionMMcDonald_2018 -0.02227 0.03345 -0.666 0.5058
## CollectionSyngenta -0.23710 0.02411 -9.836 < 2e-16 ***
## CollectionThird_batch_2018_BM_TM -0.05951 0.03457 -1.721 0.0855 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09653 on 1067 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.7924, Adjusted R-squared: 0.7883
## F-statistic: 194 on 21 and 1067 DF, p-value: < 2.2e-16
##Conduct analysis of variance
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: as.numeric(Composite_median)
## Sum Sq Df F value Pr(>F)
## Continent 17.3013 6 309.435 < 2.2e-16 ***
## Collection 11.2446 15 80.445 < 2.2e-16 ***
## Residuals 9.9431 1067
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
##
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent + Collection,
## data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45164 -0.04214 0.00525 0.04319 0.50785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.76372 0.03086 57.144 < 2e-16 ***
## ContinentAsia -0.13501 0.10119 -1.334 0.1824
## ContinentEurope -0.22017 0.02011 -10.948 < 2e-16 ***
## ContinentMiddle East 0.13058 0.02027 6.443 1.77e-10 ***
## ContinentNorth America -0.44765 0.01948 -22.974 < 2e-16 ***
## ContinentOceania -0.40964 0.02634 -15.553 < 2e-16 ***
## ContinentSouth America -0.31207 0.02138 -14.598 < 2e-16 ***
## CollectionEschikon_2017 -0.15579 0.02544 -6.123 1.29e-09 ***
## CollectionETHZ_2020 -0.04931 0.02794 -1.765 0.0779 .
## CollectionFillinger -0.15375 0.07216 -2.131 0.0334 *
## CollectionGWASpanel_BIOGER -0.13263 0.02553 -5.195 2.45e-07 ***
## CollectionHartmann_FstQst_2015 -0.36444 0.02693 -13.532 < 2e-16 ***
## CollectionHartmann_Oregon_2016 -0.16815 0.02937 -5.725 1.34e-08 ***
## CollectionJGI 0.02059 0.03034 0.678 0.4976
## CollectionJGI_2 -0.01836 0.03006 -0.611 0.5414
## CollectionJGI_3 -0.16092 0.03328 -4.835 1.53e-06 ***
## CollectionJGI_4 -0.10925 0.09933 -1.100 0.2717
## CollectionJGI_Thierry -0.06220 0.02896 -2.148 0.0319 *
## CollectionMegan_McDonald -0.07177 0.04165 -1.723 0.0851 .
## CollectionMMcDonald_2018 -0.02227 0.03345 -0.666 0.5058
## CollectionSyngenta -0.23710 0.02411 -9.836 < 2e-16 ***
## CollectionThird_batch_2018_BM_TM -0.05951 0.03457 -1.721 0.0855 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09653 on 1067 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.7924, Adjusted R-squared: 0.7883
## F-statistic: 194 on 21 and 1067 DF, p-value: < 2.2e-16
hist(residuals(model), col="darkgray")
#Post-hoc analysis: mean separation tests
marginal = lsmeans(model, ~ Continent)
pairs(marginal, adjust="sidak")
## contrast estimate SE df t.ratio p.value
## Africa - Asia 0.1350 0.1012 1067 1.334 0.9854
## Africa - Europe 0.2202 0.0201 1067 10.948 <.0001
## Africa - Middle East -0.1306 0.0203 1067 -6.443 <.0001
## Africa - North America 0.4476 0.0195 1067 22.974 <.0001
## Africa - Oceania 0.4096 0.0263 1067 15.553 <.0001
## Africa - South America 0.3121 0.0214 1067 14.598 <.0001
## Asia - Europe 0.0852 0.0998 1067 0.853 1.0000
## Asia - Middle East -0.2656 0.1006 1067 -2.639 0.1630
## Asia - North America 0.3126 0.1004 1067 3.113 0.0391
## Asia - Oceania 0.2746 0.1018 1067 2.697 0.1390
## Asia - South America 0.1771 0.1007 1067 1.758 0.8223
## Europe - Middle East -0.3508 0.0158 1067 -22.182 <.0001
## Europe - North America 0.2275 0.0147 1067 15.499 <.0001
## Europe - Oceania 0.1895 0.0217 1067 8.744 <.0001
## Europe - South America 0.0919 0.0188 1067 4.894 <.0001
## Middle East - North America 0.5782 0.0146 1067 39.586 <.0001
## Middle East - Oceania 0.5402 0.0221 1067 24.417 <.0001
## Middle East - South America 0.4427 0.0183 1067 24.160 <.0001
## North America - Oceania -0.0380 0.0207 1067 -1.839 0.7624
## North America - South America -0.1356 0.0174 1067 -7.781 <.0001
## Oceania - South America -0.0976 0.0251 1067 -3.886 0.0023
##
## Results are averaged over the levels of: Collection
## Note: contrasts are still on the as.numeric scale
## P value adjustment: sidak method for 21 tests
CLD_RIP = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "sidak") ### sidak-adjusted p-values
CLD_RIP
## Continent lsmean SE df lower.CL upper.CL .group
## North America 1.21 0.0132 1067 1.17 1.24 a
## Oceania 1.25 0.0191 1067 1.19 1.30 ab
## South America 1.34 0.0180 1067 1.29 1.39 c
## Europe 1.43 0.0104 1067 1.41 1.46 d
## Asia 1.52 0.0997 1067 1.25 1.79 bcdef
## Africa 1.65 0.0195 1067 1.60 1.71 e
## Middle East 1.79 0.0149 1067 1.75 1.83 f
##
## Results are averaged over the levels of: Collection
## Results are given on the as.numeric (not the response) scale.
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 7 estimates
## Note: contrasts are still on the as.numeric scale
## P value adjustment: sidak method for 21 tests
## significance level used: alpha = 0.05
## NOTE: Compact letter displays can be misleading
## because they show NON-findings rather than findings.
## Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
CLD_RIP$.group=gsub(" ", "", CLD_RIP$.group)
text_y = (max(as.numeric(temp$Composite_median), na.rm = T) + 0.1*max(as.numeric(temp$Composite_median), na.rm = T))
RIP_plot +
geom_text(data = CLD_RIP, aes(x = Continent,
y = text_y,
label = .group), color = "black")
It is known that different TE groups, in particular the MITEs, which are particularly small are less RIPped than other types of TEs. I wanted to check whether we saw such a pattern and so I visualize here the RIP per superfamily of TEs and then as related to the size of the consensus.
#Per TE superfamily
RIP %>%
filter(Normalized_nb_reads_mapped > 0.0001) %>%
group_by(Superfamily)%>%
mutate(median_Superfamily=median(Composite_median, na.rm = T) )%>%
ggplot(aes(x = Superfamily,
y = as.numeric(Composite_median),
fill = median_Superfamily)) +
geom_boxplot(outlier.shape = NA) +
theme_bw() +
ylab("Median of composite index on TE reads") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy")
temp = RIP %>%
filter(Normalized_nb_reads_mapped > 0.0001) %>%
group_by(Superfamily, Continent, Order)%>%
dplyr::summarize(median_Superfamily=median(Composite_median, na.rm = T) )
temp %>%
filter(Order == "Class I (retrotransposons)") %>%
ggplot(aes(x = Superfamily,
y = median_Superfamily,
fill = Continent)) +
geom_bar(stat = "identity", position=position_dodge()) +
Fill_Continent +
theme_bw() +
ylab("Median of composite index on TE reads") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
temp %>%
filter(Order == "Class II (DNA transposons)") %>%
ggplot(aes(x = Superfamily,
y = median_Superfamily,
fill = Continent)) +
geom_bar(stat = "identity", position=position_dodge()) +
Fill_Continent +
theme_bw() +
ylab("Median of composite index on TE reads") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
#As relating to TE size
#NB: in the following plot, the alpha parameter is set to make the TE without any reads (or near so) invisible
#This means that not all consensus are visible. In particular, some, annotated by Ursula as "verylowcopy" are not.
TE_consensus_faidx =read_delim(paste0(TE_RIP_dir, "Badet_BMC_Biology_2020_TE_consensus_sequences.fasta.fai"),
col_names = c("TE", "length", "offset",
"number of bases per line", "number of bytes per line"), delim = "\t")
p = inner_join(RIP, TE_consensus_faidx) %>%
dplyr::group_by(Order, TE, length) %>%
filter (TE != "RIP_est") %>%
filter(!is.na(Normalized_nb_reads_mapped)) %>%
dplyr::summarize(median_per_consensus=median(Composite_median, na.rm = T),
read_mapped = median(Normalized_nb_reads_mapped), na.rm = T) %>%
ggplot(aes(x = log(length),
y = median_per_consensus,
color = Order)) +
geom_point(aes( text = TE,
alpha = log(read_mapped))) + theme_bw() +
ylim(c(-2, 4)) + geom_hline(yintercept = 0) +
labs(x = "TE length (log-scale)",
y = "Median of RIP composite index",
title = str_wrap(paste("Median of the RIP composite index for each TE consensus",
"against the length of the consensus sequence"), width = 60)) # + geom_smooth(span = 1.5, fill = NA, size =0.7)
ggplotly(p)
p = inner_join(RIP, TE_consensus_faidx) %>%
filter (TE != "RIP_est") %>%
dplyr::group_by(Order, TE, length) %>%
dplyr::summarize(median_per_consensus=median(Composite_median, na.rm = T),
read_mapped = median(Normalized_nb_reads_mapped), na.rm = T) %>%
filter(str_detect(TE, pattern = "SINE") | str_detect(TE, pattern = "MITE") ) %>%
ggplot(aes(x = log(length),
y = median_per_consensus,
color = Order, text = TE,
alpha = log2(read_mapped))) +
geom_point() + theme_bw() +
ylim(c(-2, 4)) + geom_hline(yintercept = 0)
ggplotly(p)
Finally, I look at the relation between the amount of reads mapping on TE consensus and the level of RIP detected. I also investigate several possible bias.
TE_RIP = inner_join(TE_qty, RIP %>% filter(TE == "RIP_est"))
TE_RIP$Sampling_Date[is.na(TE_RIP$Sampling_Date)] <- "Unknown"
temp = TE_RIP %>%
group_by(Continent) %>%
dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
dplyr::mutate(for_display = Continent)
#TE and RIP together
t = ggplot(TE_RIP, aes(as.numeric(Percent_TE_Reads),
as.numeric(Composite_median),
color = Continent,
text = for_display))+
theme_cowplot() +
geom_point(alpha = 0.6) + Color_Continent +
labs(color = "Geographical group",
x = "Percentage of TE reads", y = "RIP composite median",
title = "Amount of transposable elements vs RIP level") +
geom_point(data = temp, aes(as.numeric(TE_qty),
as.numeric(Composite_median),
color = Continent), size = 5)
#ggplotly(t)
t
TE_RIP %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Total_insertions),
as.numeric(Composite_median),
color = Continent,
text = for_display))+
theme_cowplot() +
geom_point(alpha = 0.6) + Color_Continent +
labs(color = "Geographical group",
x = "TE insertion number", y = "RIP composite median",
title = "Amount of transposable elements vs RIP level")
bias1 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color = Collection, text = for_display)) +
theme_cowplot() +
geom_point(alpha = 0.8)
#bias2 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color =Library_strategy, text = for_display)) +
# theme_cowplot() +
# geom_point(alpha = 0.8)
ggplotly(bias1)
cowplot::plot_grid(RIP_plot +
labs (title = "", subtitle = ""), #+
#geom_text(data = CLD_RIP, aes(x = Continent,
# y = text_y,
# label = .group),
# color = "black"),
TE_prop +
labs (title = "", subtitle = "") +
#geom_text(data = CLD, aes(x = Continent,
# label = .group,
# y = 34),
# color = "black") +
theme(legend.position = "none") +
coord_flip())
subset = TE_RIP %>%
filter(Collection != "Hartmann_FstQst_2015")
temp = subset %>%
group_by(Continent) %>%
dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
dplyr::mutate(for_display = Continent)
t = ggplot(subset, aes(as.numeric(Percent_TE_Reads),
as.numeric(Composite_median),
color = Continent,
text = for_display)) +
theme_cowplot() +
geom_point(alpha = 0.6) + Color_Continent +
labs(color = "Geographical group",
x = "Percentage of TE reads", y = "RIP composite median",
title = "Amount of transposable elements vs RIP level") +
geom_point(data = temp, aes(as.numeric(TE_qty),
as.numeric(Composite_median),
color = Continent), size = 5)
ggplotly(t)
The RIP index does seem consistent so far with what Cécile has found, with higher RIP in the Middle-East and African populations and lower in the rest (in particular North America and Oceania).
RIP_in_high_copies_TE = read_delim(paste0(TE_RIP_dir, "All_genomes.all_TEs.GC.RIP.tab"),
col_names = c("Sample", "Seq_ID", "GC", "Product_index", "Substrate_index", "Composite_index"),
delim = "\t")
temp = RIP_in_high_copies_TE %>%
separate(Seq_ID, into = c("Sample", "Chrom", "Start", "End"),
sep = "\\.|-|:", remove = F) %>%
dplyr::mutate(Start = as.integer(Start), End = as.integer(End)) %>%
dplyr::mutate(Coord = (Start + End)/2)
#temp %>%
# filter(Chrom == "chr_4") %>%
# filter(Composite_index <= 2 & Composite_index >= - 2 )%>%
# ggplot(aes(Coord, Composite_index, col = Composite_index)) +
# geom_point() +
# facet_wrap (vars(Sample), ncol = 1) +
# theme_bw() +
# scale_color_viridis_c()
temp %>%
dplyr::mutate(Nb = str_pad(trunc((Coord / 100000), 0), 6, pad = "0")) %>%
filter(Composite_index >= -2 & Composite_index<= 2) %>%
#filter(Chrom == "chr_1") %>%
unite(Window, Chrom, Nb) %>%
group_by(Sample, Window) %>%
dplyr::summarize(Composite = mean(Composite_index, na.rm = T)) %>%
ggplot(aes(Window, Sample, fill = Composite)) +
geom_tile() +
theme_bw() +
scale_fill_viridis_c()
temp %>%
filter(Composite_index >= -3 & Composite_index<= 3) %>%
mutate(Chr = as.integer(str_replace(Chrom, "chr_", ""))) %>%
group_by(Sample, Chr) %>%
dplyr::summarize(Composite = mean(Composite_index, na.rm = T)) %>%
ggplot(aes(Chr, Sample, fill = Composite)) +
geom_tile() +
theme_bw() +
scale_fill_viridis_c()
Next step will be to check if dim2 is present where the RIP values suggest they are: intact copies in the Middle-East and Africa and absence/degeneration in the rest. Here, I use de novo genome assemblies and try to identify copies of dim2. For this, I use a deRIPped sequence of dim2 in the reference, blasted it on to Zt10 to get the sequence of Zt10_dim2 since it is known to have an intact sequence (see Mareike’s paper). I then compare this sequence (blastn) with de de novo assemblies to pull all the copies and identify the highest identity score.
#Here the Zt10_dim2_from_MgDNMT_deRIP.fa is the sequence of Zt10_6.417 which corresponds to the deRIPPed version of dim2 in the reference IPO323 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2940312/pdf/GEN186167.pdf
#sbatch -p normal.168h --array=1-1195%50 Detect_gene_blast_array.sh /home/alice/WW_PopGen/Directories_new.sh /data2/alice/WW_project/0_Data/Zt10_dim2_from_MgDNMT_deRIP.fa /home/alice/WW_PopGen/Keep_lists_samples/Good_assemblies.args Illumina /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/
Let’s look at the results as dot plots and compare the results from the dim2 blast to the RIP composite index. So far the Middle-Eastern samples seem quite similar to one another, whereas other regions contain way more variability such as Europe.
In order to identify the native dim2 copy in genomes, I look for several things:
#system(paste0("cat ", DIM2_DIR, "*.blast.tab > ", DIM2_DIR, "Overall_results_blast_dim2.txt"))
length_dim2 = 3846
length_flank1 = 3689
length_flank2 = 1222
threshold_length_dim2 = 0.8 * length_dim2
threshold_length_flank1 = 0.8 * length_flank1
threshold_length_flank2 = 0.8 * length_flank2
dim2_blast_results_complete = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.txt"), delim = " ",
col_names = c("sample", "gene", "qseqid", "sseqid", "pident", "length",
"mismatch", "gapopen", "qstart", "qend",
"sstart", "send", "evalue", "bitscore"))
flank1 = dim2_blast_results_complete %>%
filter(gene == "dim2_flank1_Zt10_unitig_006_0418") %>%
dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
dplyr::mutate(midcoord_flank1 = (sstart + send)/2) %>%
filter(length > threshold_length_flank1) %>%
pivot_wider(names_from = gene, values_from = sseqid) %>%
dplyr::select(-length, -sstart, -send) %>%
group_by(sample) %>%
dplyr::mutate(nb_gene = n(),
dim2_flank1 = dim2_flank1_Zt10_unitig_006_0418 )
flank2 = dim2_blast_results_complete %>%
filter(gene == "dim2_flank2_Zt10_unitig_006_0416") %>%
dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
dplyr::mutate(midcoord_flank2 = (sstart + send)/2) %>%
filter(length > threshold_length_flank2) %>%
pivot_wider(names_from = gene, values_from = sseqid) %>%
dplyr::select(-length, -sstart, -send) %>%
group_by(sample) %>%
dplyr::mutate(nb_gene = n(),
dim2_flank2 = dim2_flank2_Zt10_unitig_006_0416)
#First let's extract some information such as finding the middle coordinates for the
# 3 genes of interest
dim2_blast_results = dim2_blast_results_complete %>%
dplyr::filter(gene == "Zt10_dim2_from_MgDNMT_deRIP") %>%
full_join(., flank1, by = "sample") %>%
full_join(., flank2, by = "sample") %>%
dplyr::mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
dim2_flank2 = replace_na(dim2_flank2, "Not_found"),
midcoord_flank1 = replace_na(midcoord_flank1, "0"),
midcoord_flank2 = replace_na(midcoord_flank2, "0")) %>%
dplyr::mutate(midcoord_flank1 = as.numeric(midcoord_flank1),
midcoord_flank2 = as.numeric(midcoord_flank2)) %>%
dplyr::mutate(ave_coord = (sstart + send)/2) %>%
dplyr::mutate(min_fl = ifelse(midcoord_flank1 > midcoord_flank2,
midcoord_flank2, midcoord_flank1)) %>%
dplyr::mutate(max_fl = ifelse(midcoord_flank1 > midcoord_flank2,
midcoord_flank1, midcoord_flank2))
#Now, let's compare the blast results of dim2 with the flanking genes
# (contig name and distance)
dim2_blast_results = dim2_blast_results %>%
dplyr::mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
"Both",
ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
"None",
ifelse(sseqid == dim2_flank1, "Flank1",
ifelse(sseqid == dim2_flank2, "Flank2", "What"))))) %>%
dplyr::mutate(distance = ifelse(is_same_contig == "None", "No_distance",
ifelse(is_same_contig == "Both",
ifelse(ave_coord >= min_fl &
ave_coord <= max_fl ,
"Distance_OK", "Not_between"),
ifelse(is_same_contig == "Flank1",
ifelse((abs(midcoord_flank1 - ave_coord) <= 10000),
"Distance_OK", "Too_far"),
ifelse((abs(midcoord_flank2 - ave_coord) <= 10000),
"Distance_OK", "Too_far"))))) %>%
mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0",
ifelse(distance == "Distance_OK", ">1", "0")))
#dplyr::select(sample, sseqid, length, dim2_flank1, dim2_flank2, is_same_contig) %>%
dim2_blast_results %>%
ggplot(aes(length)) +
geom_density(alpha = 0.7) +
theme_bw() +
geom_vline(aes(xintercept = threshold_length_dim2), color = "gray70", size = 0.6) +
labs(x = "Length (bp)",
y = "Density",
title = "Length of all blast matches against Zt10dim2",
subtitle = str_wrap(paste("There is a large range in the size of the matches.",
" In order to ignore very short matches, I could set up a threshold",
" visualized here as the grey line."),
width = 70))
However, the length also greatly varies for copies for which we were able to detect one or two of the flanking genes on the same contig.
p1 = dim2_blast_results %>%
ggplot(aes(length, fill = is_same_contig, color = is_same_contig)) +
geom_density(alpha = 0.7) +
theme_bw() +
labs(x = "Length (bp)",
y = "Density",
title = str_wrap("Length of all blast matches against Zt10dim2", width = 30))
p3 = dim2_blast_results %>%
ggplot(aes(Nb_flanking_found_2cat, fill = is_same_contig)) +
geom_bar(alpha = 0.7) +
theme_bw() +
labs ( x = "Number of flanking genes on the same contig",
y = "Number of copies",
title = str_wrap("Numbers of copies found in the same contig as the flanking genes ", width = 30)) +
theme(legend.position = "none")
legend_b <- get_legend(
p1 +
guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "bottom")
)
prow <- plot_grid(
p1 + theme(legend.position="none"),
p3 + theme(legend.position="none"),
align = 'vh',
labels = c("A", "B"),
hjust = -1,
nrow = 1)
plot_grid(prow, legend_b, ncol = 1, rel_heights = c(1, .1))
Now, I will select only the copies which have a least one flanking gene found on the same contig. I consider these the “native” copy of the gene. And I will then make some plots using only these original copies to investigate the geographical distribution of both length and identity with the dim2 copy of Zt10.
temp = dim2_blast_results %>%
group_by(sample) %>%
dplyr::summarize(n_matches = n(),
n_long_matches = sum(length > 1000))
sum_dim2_blast = dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
group_by(sample) %>%
dplyr::summarize(length_sum = sum(length),
pident_wm = weighted.mean(pident, length),
n_matches_on_native_contigs = n()) %>%
filter(length_sum < length_dim2 + 200) %>%
inner_join(., temp) %>%
inner_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file"))
#Plots of identity vs length of the original copy
scatter <- sum_dim2_blast %>%
ggplot(aes(length_sum, pident_wm,
color = Continent,
shape = as.character(n_matches_on_native_contigs))) +
geom_point() +
theme_bw() + Color_Continent +
theme(legend.position = "none") +
labs( x = "Length of the recovered native copy",
y = "Identity with Zt10dim2")
hist_top = sum_dim2_blast %>%
filter(Continent != "Asia") %>%
ggplot(aes(Continent, length_sum,
fill = Continent, color = Continent)) +
geom_boxplot(alpha = 0.6, width = 0.4) +
theme_bw() +
Fill_Continent + Color_Continent+
theme(legend.position = "none",
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7)) +
coord_flip() +
labs( x = "", y = "")
hist_right = sum_dim2_blast %>%
filter(Continent != "Asia") %>%
ggplot(aes(Continent, pident_wm, fill = Continent)) +
geom_violin(alpha = 0.6) +
theme_bw() + Fill_Continent +
theme(legend.position = "none",
axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
labs( x = "", y = "")
#Options for top right plot
legend_b <- get_legend(
hist_top +
guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "right")
)
empty <- ggplot()+geom_point(aes(1,1), colour="white")+
theme(axis.ticks=element_blank(),
panel.background=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(),
axis.title.x=element_blank(), axis.title.y=element_blank())
#Gather all
cowplot::plot_grid(hist_top, legend_b, scatter, hist_right,
ncol=2, nrow=2,
rel_widths=c(1.2, 1), rel_heights=c(1, 1),
align = 'vh')
sum_dim2_blast %>%
filter(Continent != "Asia") %>%
dplyr::mutate(Nb_frag = as.character(n_matches_on_native_contigs)) %>%
dplyr::count(Nb_frag, Collection, Continent) %>%
ggplot(aes(Collection, Continent, color = Collection)) +
geom_point(aes(size = n), stat = "identity", alpha = 0.6) +
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
labs( x = "", y = "") +
Fill_Continent +
facet_wrap(vars(Nb_frag))
Here a large proportion of the Middle-Eastern samples from the old Hartmann dataset have two matches on the “right” contigs: this is due to a deletion found in one of the allele of dim2 (shown in Mareike’s new version of her dim2 manuscript).
Let’s investigate quickly how many long copies there are in each genome. I’m also interested in the native match as related to RIP and geography.
p1 = sum_dim2_blast %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Identity of the native match",
width = 20))
p2 = sum_dim2_blast %>%
ggplot(aes(as.numeric(Composite_median), n_long_matches, color = Continent,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Number of long blast matches (above 1kb)",
width = 20))
p3 = sum_dim2_blast %>%
ggplot(aes(x = pident_wm, y = n_long_matches, color = Continent,
shape = Collection)) +
geom_point() + Color_Continent + theme_bw()+
labs(x = str_wrap("Identity of the native match",
width = 20),
y = str_wrap("Number of long blast matchess (above 1kb)",
width = 20),
color = "Geographical group") +
theme(axis.title=element_text(size=10))
bottom_row <- cowplot::plot_grid(p1, p2, labels = c('B', 'C'), label_size = 12)
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$Composite_median, method="spearman")
##
## Spearman's rank correlation rho
##
## data: sum_dim2_blast$pident_wm and sum_dim2_blast$Composite_median
## S = 89748704, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2710887
cor.test(sum_dim2_blast$n_long_matches, sum_dim2_blast$Composite_median, method="spearman")
##
## Spearman's rank correlation rho
##
## data: sum_dim2_blast$n_long_matches and sum_dim2_blast$Composite_median
## S = 129899429, p-value = 0.09839
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.05500309
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$n_long_matches, method="spearman")
##
## Spearman's rank correlation rho
##
## data: sum_dim2_blast$pident_wm and sum_dim2_blast$n_long_matches
## S = 136729196, p-value = 0.0008775
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1104724
cowplot::plot_grid(p3, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)
p1 = sum_dim2_blast %>%
filter(Collection == "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Identity of the native match",
width = 20))
p2 = sum_dim2_blast %>%
filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Identity of the native match",
width = 20))
cowplot::plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)
And then as boxplots per continental region.
p1 = sum_dim2_blast %>%
group_by(Continent) %>%
dplyr::mutate(avg_per_cont = mean(pident_wm, na.rm = T)) %>%
ggplot(aes(Continent, pident_wm,
fill = Continent)) +
geom_violin(aes(fill = Continent), alpha = 0.5) +
geom_jitter(aes(col = Continent), size = 0.8, alpha = 0.8, width = 0.2, height = 0.1) +
Fill_Continent + Color_Continent +
theme_light() +
theme(
panel.border = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "none",
axis.text.x = element_text(size = 10, angle = 40, hjust = 1)) +
coord_flip() +
labs(x = NULL,
y = "Identity of the native copy to Zt10dim2")
p2 = ggplot(sum_dim2_blast, aes(Continent, n_long_matches,
fill = Continent)) +
geom_violin() + Fill_Continent +
theme_light() +
theme(panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1))+
labs(x = NULL,
y = "Number of dim2 copies") + coord_flip()
cowplot::plot_grid(p1, p2, rel_widths = c(1, 1))
# TODO
# Use beginning and end of dim2 gene (100bp) to get gene boundaries on assemblies.
# Update: I tried. But it really does not work better...
#Here is the code of the comparison in case I need to use it later still
#Selecting start and end coordinates which are found on a contig
# with at least one of the flanking genes.
dim2_start = dim2_blast_results_complete %>%
filter(gene == "dim2_start") %>%
full_join(., flank1) %>%
full_join(., flank2) %>%
mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
"Both",
ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
"None",
ifelse(sseqid == dim2_flank1, "Flank1",
ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
filter(Nb_flanking_found_2cat == ">1")
dim2_end = dim2_blast_results_complete %>%
filter(gene == "dim2_end") %>%
full_join(., flank1) %>%
full_join(., flank2) %>%
mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
"Both",
ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
"None",
ifelse(sseqid == dim2_flank1, "Flank1",
ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
filter(Nb_flanking_found_2cat == ">1")
# Table from start/end
#full_join(dim2_start, dim2_end, by = "sample") %>%
# filter(complete.cases(.)) %>%
# select(sample, dim2_flank1.x, dim2_flank2.x,
# gene.x, sseqid.x, sstart.x, send.x, is_same_contig.x, Nb_flanking_found_2cat.x,
# gene.y, sseqid.y, sstart.y, send.y, is_same_contig.y, Nb_flanking_found_2cat.y) %>%
# left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
# group_by(is_same_contig.x, Continent) %>%
# dplyr::summarise(n()) %>%
# pivot_wider(names_from = Continent, values_from = `n()`)
# Table from the "simple" blast as comparison
#dim2_blast_results %>%
# dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
# left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
# filter(length > 2500) %>%
# dplyr::count(is_same_contig, Continent) %>%
# pivot_wider(names_from = Continent, values_from = n)
#Keep only copies with:
# - both flanking genes
# - one flanking gene but a length of 2500 at least
dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
filter(length > 2500) %>%
dplyr::count(is_same_contig, Continent) %>%
dplyr::group_by(Continent) %>%
dplyr::mutate(Somme_per_collection = sum(n)) %>%
mutate(prop = n*100/Somme_per_collection) %>%
ggplot(aes(x = is_same_contig, y = Continent, fill = prop)) +
geom_tile() +
theme_bw() +
scale_fill_viridis_c() +
labs (title = str_wrap(paste0("Proportion of long dim2 matches ",
"with at least one flanking gene"),
width = 70))
#Write the bed file to extract the sequences
dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
dplyr::filter(length > 2500) %>%
dplyr::mutate(start = ifelse(sstart > send, send, sstart),
end = ifelse(sstart > send, sstart, send)) %>%
dplyr::select(sample, sseqid, start, end) %>%
write_delim(paste0(TE_RIP_dir, "Coordinates_dim2_all_samples.bed"),
col_names = F)
#This command has to be run on the cluster
#while read sample chr start end ; do
# echo -e "${chr}\t${start}\t${end}" > temp.bed ;
# ~/Software/bedtools getfasta \
# -fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta \
# -bed temp.bed | sed "s/>/>${sample}:/" >> \
# /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Native_dim2_all_samples.fasta ;
#done < /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Coordinates_dim2_all_samples.bed
#Run through the website because laziness
# mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output
For the following tree, I used the gene alignment from mafft (online) and created a neighbor-joining tree. The gene sequence from Z. passerinii was used as the outgroup sequence. Then, I attempt to represent both the geographical origin or the samples and the RIP level.
#Prep data to add to tree
temp2 = dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
dplyr::filter(length > 2500) %>%
dplyr::mutate(start = ifelse(sstart > send, send, sstart),
end = ifelse(sstart > send, sstart, send)) %>%
unite(coord, start, end, sep = "-") %>%
dplyr::mutate(no_dot = stringr::str_replace(sseqid, "\\.", "_"))%>%
unite(contig2, sample, no_dot, coord, sep = "_", remove = F)
#Read tree in
#details from the mafft website about the tree
#Size = 237 sequences × 1214 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 100
tree = as_tibble(treeio::read.tree("~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk")) %>%
mutate(label_copy = label) %>%
separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
dplyr::mutate(nb = as.integer(nb)) %>%
dplyr::mutate(label_OG = label) %>%
dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
full_join(., temp2, by = c("contig2")) %>%
dplyr::mutate(label_temp = ifelse(is.na(sample), ifelse(is.na(contig), label, contig), sample)) %>%
unite(col = "label_new", nb, label_temp, sep = "_")
tree2 = as_tibble(treeio::read.tree("~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk"))
temp = tree %>%
dplyr::select(label, node, label_new, Continent, Composite_median, Collection) %>%
mutate(Composite_median = ifelse(is.na(Composite_median), 0, Composite_median)) %>%
filter(!is.na(label)) %>%
mutate(RIP = ifelse(Composite_median >= 1.5, "High", ifelse(Composite_median <= 1, "Low", "Medium")))
#Find the outgroup!
root_node = tree2 %>%
filter(grepl("Zpa63", label)) %>%
dplyr::select(node) %>%
pull()
rooted.tree <- ape::root(as.treedata(tree2), root_node)
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select (-label))
p2 <- p + geom_tippoint(aes(color = Continent)) +
theme(legend.position = "right") +
Color_Continent
p3 <- p + geom_tippoint(aes(color = RIP), alpha = 0.5) +
theme(legend.position = "right") +
scale_color_manual(values = c("darkred", "yellow", "orange"))
p4 <- p + geom_tippoint(aes(color = Collection), alpha = 0.5) +
theme(legend.position = "right")
cowplot::plot_grid(p2, p3)
#df = temp %>% dplyr::select(RIP)
#rownames(df) <- temp$label
#gheatmap(p2, df, width=.2) +
# scale_fill_manual(values = c("darkred", "yellow", "orange"), name = "RIP") +
# labs(title = "Gene tree for the dim2 sequence",
# y = "")
Based on these trees, there is some clustering according to geography. Additionally, it looks like the Middle-Eastern samples have either a high or low RIP level on their TE, whereas the level is rather in the middle of the range elsewhere…
If there is a relaxation of RIP, we could expect that TEs would not be the only things impacted but that gene duplications would also be possible more in RIP-relaxed genomes than in RIP-active.
Badet_pan_table = read_tsv(paste0(TE_RIP_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
dplyr::mutate(nb_genes = n()) %>%
group_by(isolate, N_genes) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::filter(N_genes <= 10) %>%
ggplot(aes(isolate, Percent, fill = N_genes)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
scale_fill_gradient(low = "#EAEAEA", high = "#294D4A", na.value = NA)
Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
dplyr::mutate(nb_genes = n()) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
group_by(isolate, N_genes_cat) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes) %>%
ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
dplyr::mutate(nb_genes = n()) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
group_by(isolate, N_genes_cat) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes) %>%
dplyr::filter(N_genes_cat == 2) %>%
ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
core_duplicated_genes = read_tsv(paste0(depth_per_gene_dir, "Sabina_gcnv_event.filtered.core_chr.per_sample.tsv"))
temp = inner_join(sum_dim2_blast, core_duplicated_genes, by = c("sample" = "Sample")) %>%
filter(M < 150)
p1 = ggplot(temp, aes(as.numeric(Composite_median),
M,
color = Continent,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Number of duplicated genes",
width = 20))
p2 = ggplot(temp, aes(pident_wm,
M,
color = Continent,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("Identity of the native match"),
width = 20),
y = str_wrap("Number of duplicated genes",
width = 20))
legend_b <- get_legend(p1 + theme(legend.position = "right"))
top = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, ncol = 1)
cowplot::plot_grid(top, legend_b, nrow = 1)
ggplot(temp, aes(as.numeric(Composite_median),
M,
color = Continent,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() +
labs(x = str_wrap(paste("RIP composite index", " (median per isolate)"),
width = 20),
y = str_wrap("Number of duplicated genes", width = 20)) +
facet_wrap(vars(Continent), scales = "free")
temp = inner_join(core_duplicated_genes, TE_qty, by = c("Sample" = "ID_file")) %>%
filter(!is.na(Cluster))
filter(temp, M < 100) %>%
ggplot(aes(M, Total_insertions, col = Continent)) +
geom_point()+
Color_Continent +
theme_bw()
filter(temp, M < 100) %>%
ggplot(aes(M, Percent_TE_Reads, col = Continent)) +
geom_point()+
Color_Continent +
theme_bw()
p1 = filter(temp, M < 100) %>%
ggplot(aes(Cluster, M, fill = Cluster)) +
geom_violin()+
geom_boxplot(col = "white", width = .2) +
Fill_Cluster+
theme_bw()
p2 = ggplot(temp, aes(Cluster, D, fill = Cluster)) +
geom_violin()+
geom_boxplot(col = "white", width = .2) +
Fill_Cluster+
theme_bw()
cowplot::plot_grid(p1 + coord_flip() + theme(legend.position = "none"),
p2 + coord_flip() + theme(legend.position = "none"))
#Number of accessory chromsomes
Lthres = 0.25
Hthres = 1.75
# Reading in the summarized data
core_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_sample_core_chr_summary.q30.txt")) %>%
mutate(Median_core = Median)
chrom_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_chromosome_summary.q30.txt")) %>%
left_join(., core_depth %>% dplyr::select(Sample, Median_core)) %>%
mutate(Relative_depth = as.numeric(Median)/(0.001 + as.numeric(Median_core))) %>%
filter(CHROM != "mt") %>%
dplyr::mutate(Depth_is = ifelse(Relative_depth > Hthres, "High",ifelse(Relative_depth < Lthres, "Low", "Normal"))) %>%
dplyr::mutate(PAV = ifelse(as.numeric(Relative_depth) > 0.5, "P", "D"))
temp = chrom_depth %>%
filter(as.numeric(CHROM) > 13) %>%
dplyr::count(Sample, PAV) %>%
pivot_wider(names_from = PAV, values_from = n, values_fill = 0) %>%
dplyr::select(ID_file = Sample, Acc_chr = P)
#Number of duplicated genes
duplicated_loci = core_duplicated_genes %>%
dplyr::select(ID_file = Sample, Duplicated_genes = M) %>%
full_join(., temp)
#Create standardized values for the environment variables
stand = left_join(TE_RIP, duplicated_loci) %>%
dplyr::select(ID_file, Percent_TE_Reads, Composite_median, Composite_mean, Acc_chr, Duplicated_genes, Non_ref_insertions) %>%
pivot_longer(cols = -c(ID_file), names_to = "Variable", values_to = "Value") %>%
group_by(Variable) %>%
dplyr::mutate(Standard_value = scale(Value)) %>%
dplyr::select(-Value) %>%
pivot_wider(names_from = Variable, values_from = Standard_value, values_fn = length)
#The fam file should be same for all core chromosomes, hopefully. So I only have to create one to get the file format including the phenotypes. This can then be used for all chromosomes.
temp2 = left_join(read_tsv(Zt_list, col_names = "ID_file") %>% mutate(ID2 = ID_file),
Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
bind_cols(., tibble(X1 = rep(0, nrow(Zt_meta)), X2 = rep(0, nrow(Zt_meta)), X3 = rep(0, nrow(Zt_meta)))) %>%
inner_join(., stand)
dplyr::select(temp2, -Latitude, -Longitude) %>%
write_delim(file = paste0(TE_RIP_dir, "Phenotypes_for_GWAS.fam"), delim = " ", col_names = F)
#Adding PC info as covariates
left_join(read_tsv(Zt_list, col_names = "ID_file"), read_tsv(paste0(PopStr_dir, vcf_name, ".PCA_results.tab"))) %>%
mutate(Intercept = 1) %>%
dplyr::select(Intercept, PC1, PC2, PC3, PC4) %>%
write_delim(., paste0(TE_RIP_dir, "PC_for_correction.cov"), delim = " ", col_names = F)
#Transfer on the cluster
#rsync -avP ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Phenotypes_for_GWAS.fam alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/Phenotypes_for_GWAS.fam
#rsync -avP ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/PC_for_correction.cov alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/PC_for_correction.cov
#To run gemma on the cluster: conda activate env0
#Commands are then in the format gemma -h
# GWAS
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/output/Results_genomics_characteristics_* ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/2_GWAS/
phenotypes = c("TE", "RIP_med", "RIP_mean", "Accessory_chr", "Duplicated_genes", "TE_insertions")
results_GWAS = list()
for (i in c(1:length(phenotypes))) {
temp = list.files(path = paste0(TE_RIP_dir, "2_GWAS/"),
pattern = paste0("*phenotype_", i,"..subset_ALL.assoc.txt")) %>%
map_df(~read_tsv(file = paste0(paste0(TE_RIP_dir, "2_GWAS/"), .),
col_types = c("dcddccdddddd"))) %>%
unite(col = SNP, chr, ps, sep = "_", remove = F) %>%
dplyr::select(SNP, CHR = chr, BP = ps, P = p_wald, zscore = logl_H1, beta) %>%
mutate(Phenotype = phenotypes[i], Nb = i)
results_GWAS[[phenotypes[i]]] = temp
}
results_GWAS = bind_rows(results_GWAS)
#left_join(TE_RIP, duplicated_loci) %>%
# dplyr::select(ID_file, Percent_TE_Reads, Composite_median, Composite_mean, Acc_chr, Duplicated_genes) %>%
# pivot_longer(cols = -c(ID_file), names_to = "Variable", values_to = "Value")
#Genomic Inflation Factor
kable(results_GWAS %>%
group_by(Phenotype) %>%
summarize(Genomic_Inflation_Factor = median(qchisq(1-P,1))/qchisq(0.5,1)) %>%
arrange(Genomic_Inflation_Factor))
| Phenotype | Genomic_Inflation_Factor |
|---|---|
| Duplicated_genes | 0.8815758 |
| RIP_med | 0.8981546 |
| TE_insertions | 0.9754432 |
| RIP_mean | 1.0226379 |
| Accessory_chr | 1.0530365 |
| TE | 1.0728435 |
#QQ-plots
par(mfrow=c(3,1))
for (i in c(1:length(phenotypes))){
temp = results_GWAS %>% filter(Phenotype == phenotypes[i]) %>% dplyr::sample_frac(size = .10) %>% dplyr::select(P) %>% pull()
GWASTools::qqPlot(temp, thinThreshold = 2)
}
head(results_GWAS)
## # A tibble: 6 x 8
## SNP CHR BP P zscore beta Phenotype Nb
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <int>
## 1 1_111115 1 111115 0.899 -1068. 0.0131 TE 1
## 2 1_111124 1 111124 0.228 -1067. -0.124 TE 1
## 3 1_111127 1 111127 0.232 -1067. -0.123 TE 1
## 4 1_111283 1 111283 0.831 -1068. -0.0113 TE 1
## 5 1_111350 1 111350 0.987 -1068. -0.00201 TE 1
## 6 1_111358 1 111358 0.944 -1068. 0.00935 TE 1
Bonferroni_threshold = results_GWAS %>% dplyr::count(Nb) %>%
mutate(Bonferroni_threshold = 0.05/n) %>% summarize(average = mean(Bonferroni_threshold)) %>% pull()
kable(results_GWAS %>%
filter(P <= Bonferroni_threshold) %>%
dplyr::count(Phenotype) %>%
arrange(n))
| Phenotype | n |
|---|---|
| Accessory_chr | 10 |
| TE | 99 |
| RIP_med | 145 |
| TE_insertions | 193 |
| RIP_mean | 286 |
| Duplicated_genes | 786 |
results_GWAS = results_GWAS %>% mutate(SNP_type = ifelse(P <= Bonferroni_threshold, "significant", "NS"))
significant_TE = results_GWAS %>%
filter(P <= Bonferroni_threshold) %>% group_by(SNP, CHR, BP) %>%
dplyr::mutate(Count = n()) %>% mutate(SNP_type = "significant")%>%
ungroup()
significant_TE %>%
dplyr::select(CHR, BP) %>%
distinct() %>%
write_delim(paste0(TE_RIP_dir, "2_GWAS/Significant_TE.tsv"), delim = "\t", col_names = F)
Identifying the variants which are significant is very useful already, but it is hard to work with so many “independent” positions. It also does not make sense conceptually, as nearby variants are probably picking up the exact same signal through linkage. So, here I gather variants together into “significant loci” if they are no bigger gaps than a certain threshold.
#Distance between two significant SNPs to include them in the same region
len_threshold = 20000L
#For each phenotype and for each chromosome
list_loci = list()
for (j in c(1:length(phenotypes))) {
temp2 = list()
for (i in c(1:13)){
#Pull vector of position for the right chromosome and phenotype
v = filter(significant_TE, CHR == i) %>%
filter(Phenotype == phenotypes[j]) %>%
dplyr::select(CHR, BP) %>%
distinct() %>%
pull(BP) %>% sort()
# Split the vectors into group based on the length threshold
temp = split(v, cumsum(c(TRUE, diff(v) >= len_threshold)))
#print(paste(c(phenotypes[j], i, length(temp)), sep = " "))
temp2[[i]] = temp %>% map_df(enframe, .id = 'Locus_nb') %>%
dplyr::rename(BP = value)%>%
mutate(CHR = i)
}
list_loci[[j]] = bind_rows(temp2)
}
#Adding loci info to the table
significant_TE = bind_rows(list_loci) %>%
full_join(significant_TE, .)
significant_TE %>% dplyr::select(CHR, Locus_nb, Phenotype) %>%
distinct() %>%
dplyr::count(CHR, Phenotype)%>%
pivot_wider(names_from = Phenotype, values_from = n, values_fill = 0)
## # A tibble: 13 x 7
## CHR Duplicated_genes RIP_med RIP_mean TE TE_insertions Accessory_chr
## <dbl> <int> <int> <int> <int> <int> <int>
## 1 1 25 4 0 0 0 0
## 2 2 22 4 3 4 3 0
## 3 3 26 5 0 2 2 0
## 4 4 24 0 0 0 2 1
## 5 5 23 4 2 4 4 0
## 6 6 16 5 3 0 0 0
## 7 7 17 9 7 6 7 0
## 8 8 15 3 0 0 2 0
## 9 9 8 1 0 0 0 0
## 10 10 16 1 0 2 0 0
## 11 11 11 0 0 0 0 0
## 12 12 11 2 0 2 2 0
## 13 13 6 3 3 0 0 0
#rsync -avP ../4_TE_RIP/2_GWAS/Significant_TE.tsv alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/
#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.ann.vcf.gz --positions /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.tsv --out /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE --recode --recode-INFO-all
#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/ANN\n' /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.recode.vcf > /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.tsv
#grep "#CHROM" /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.recode.vcf | cut -f 1,2,4,5,10- | sed 's/#//' > /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.tab
#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.recode.vcf >> /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.tab
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.t* ../4_TE_RIP/2_GWAS/
For each region, I would like to be able to get more information about the variants: what is the distribution of the alleles compared to the phenotype with which this position was significantly associated? What is the minor allele frequency at this locus? Since it is impossible to represent all thousands of associated variants, I chose one position in particular for each significant locus identified previously.
#i=2
min_length_locus = 1000
whatever <- function(min_length_locus, pheno, expected_output){
temp = significant_TE %>%
filter(Phenotype == pheno) %>%
group_by(CHR, Locus_nb) %>%
mutate(Locus_length = 1 + max(BP) - min(BP)) %>%
filter(Locus_length >= min_length_locus) %>%
dplyr::mutate(Rank = rank(P)) %>%
arrange(Rank) %>%
ungroup() %>%
filter(Rank == 1) %>%
dplyr::select(SNP, CHR, BP)
relevant_pheno = TE_RIP %>%
dplyr::select(ID_file, TE = Percent_TE_Reads, RIP_med = Composite_median, RIP_mean = Composite_mean) %>%
pivot_longer(cols = -c(ID_file), names_to = "Phenotype", values_to = "Value") %>%
filter(Phenotype == pheno)
temp = read_delim(paste0(TE_RIP_dir, "2_GWAS/Significant_TE.ann.tab"), na = c("."), delim = "\t") %>%
inner_join(temp, by = c("CHROM" = "CHR", "POS" = "BP")) %>%
pivot_longer(-c(CHROM, POS, REF, ALT, SNP),
names_to = "ID_file", values_to = "Allele") %>%
inner_join(., relevant_pheno) %>%
filter(Allele != "NA")
if (expected_output == "plot"){
temp %>%
ggplot(aes(Value, fill = as.factor(Allele), col = as.factor(Allele))) +
geom_density(alpha = .4) +
facet_wrap(vars(SNP), scales = "free") +
theme_bw() +
labs(title = paste0("Top associated SNPs for each locus and phenotype: ", pheno))}
else{
temp %>% dplyr::count(SNP, Allele) %>%
pivot_wider(names_from = Allele, values_from = n)
}
}
whatever(min_length_locus, phenotypes[1], "plot")
whatever(min_length_locus, phenotypes[1], "table")
## # A tibble: 3 x 3
## SNP `0` `1`
## <chr> <int> <int>
## 1 12_1201012 801 25
## 2 3_3213337 1178 518
## 3 5_89642 808 43
whatever(min_length_locus, phenotypes[2], "plot")
whatever(min_length_locus, phenotypes[2], "table")
## # A tibble: 8 x 3
## SNP `0` `1`
## <chr> <int> <int>
## 1 1_2077079 748 102
## 2 10_704757 823 23
## 3 6_1367971 824 27
## 4 6_820378 504 1196
## 5 7_1183768 736 96
## 6 7_1493135 725 125
## 7 7_558777 791 60
## 8 8_2018754 834 17
#Find genes and corresponding effects
temp = read_tsv(paste0(TE_RIP_dir, "2_GWAS/Significant_TE.ann.tsv"), col_names = c("CHR", "BP", "REF", "ALT", "ANN")) %>%
separate(ANN, sep = ",",
into = c("ANN1", "ANN2", "ANN3", "ANN4", "ANN5", "ANN6", "ANN7", "ANN8", "ANN9", "ANN10", "ANN11",
"ANN12", "ANN13", "ANN14", "ANN15", "ANN16", "ANN17"),
extra = "warn")
signif_ann_TE = temp %>%
pivot_longer(cols = c(-CHR, -BP, -REF, -ALT), names_to = "ANN", values_to = "Annotation") %>%
filter(!is.na(Annotation)) %>%
separate(Annotation, into = c("ALT2", "Variant_type", "Effect_strength", "Gene", "Rest"), sep = "\\|", extra = "merge") %>%
filter(!str_detect(Gene, "-")) %>%
separate(Gene, into = c("temp", "Chrom", "Gene_nb"), remove = FALSE) %>%
dplyr::select(-ANN, -ALT2, -temp) %>%
mutate(Gene_nb = as.numeric(Gene_nb)) %>%
left_join(., read_tsv(effectors_annotation_file) %>%
filter(Sample == "Zt09") %>%
mutate(Gene = str_replace(Protein_ID, "_chr", "")))
# Compute chromosome size and cumulative position of each chromosome
results_GWAS_for_plot <- results_GWAS %>%
group_by(CHR) %>%
dplyr::summarise(chr_len=max(BP)) %>%
mutate(tot=cumsum(chr_len)-chr_len) %>%
dplyr::select(-chr_len)
#EStimate middle of chromosome to center the labels
axisdf = results_GWAS_for_plot %>%
inner_join( ., results_GWAS %>% filter(Nb == 1)) %>%
arrange(CHR, BP) %>%
mutate(BPcum = BP+tot) %>%
group_by(CHR) %>%
dplyr::summarise(center=( max(BPcum) + min(BPcum) ) / 2 )
# Manhattan plot function
Manhattan_one_pheno <- function(pheno, color_duo) {
temp = results_GWAS %>% filter(Phenotype == pheno) %>%
inner_join(., results_GWAS_for_plot) %>%
left_join(., significant_TE %>% dplyr::filter(Phenotype == pheno) ) %>%
mutate(SNP_type = ifelse(is.na(SNP_type), "Other", SNP_type)) %>%
mutate(logP = -log10(P)) %>%
filter(CHR <= 13) %>%
filter(logP > 2)
ggplot(temp, aes(x=BP, y=logP)) +
geom_point( aes(color=SNP_type), alpha = 0.8, size=1.3) +
scale_color_manual(values = color_duo) +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20") +
theme_bw() +
theme( legend.position="none", panel.border = element_blank(),
panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
axis.title.x=element_blank(), axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.background = element_rect(colour="white", fill="white"),
plot.margin = unit(c(0, 0, 0, 0), "cm")) +
facet_grid(cols = vars(CHR), scales = "free_x")
}
#Plot from gff
plot_gff <- function(gff_name, chromosome, min_coord, max_coord, genes_to_highlight, col = "black") {
gff = read_tsv(gff_name,
col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot")) %>%
separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
mutate(ID = str_remove(ID, "ID=")) %>%
filter(CHR == chromosome) %>%
filter(Start >= min_coord) %>%
filter(Stop <= max_coord) %>%
mutate(y_value = rank(Start)) %>%
arrange(Start)
## If too many genes, organize them
if (nrow(gff) > 5) {
temp = ceiling(nrow(gff)/5)
gff = gff %>% mutate(y_value = rep(c(1:5), temp)[1:nrow(gff)])
}
## Make plot
c = gff %>%
ggplot() +
geom_segment(mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value), size = 2, col = col) +
geom_text(aes(x = Stop + (max_coord - min_coord)*0.05 , y = y_value, label = ID), size = 2) +
theme_bw() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
axis.title.x = element_blank(), axis.ticks.y = element_blank(),
panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank()) +
coord_cartesian(xlim = c(min_coord, max_coord), ylim = c(min(gff$y_value) - 0.5, max(gff$y_value) + 0.5))
## Highlight interesting genes
temp = match(genes_to_highlight, gff$ID)
if ( sum(!is.na(temp)) > 0 ){
c = c + geom_segment(data = filter(gff, ID == genes_to_highlight),
mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value),
size = 3, col = "blue")
}
c
}
temp = tibble(CHR = c(5, 6), Gene = c("rid", "dnmt5"), Pos = c(255000, 2527000))
p = Manhattan_one_pheno("TE", c("#FFD399", "#ff9f1c")) +
labs(y = str_wrap("TE content (-logP)", 25)) +
geom_vline(data = temp, aes(xintercept = Pos), alpha =.4, col = "grey20")
q = Manhattan_one_pheno("TE_insertions", c("#cbf3f0", "#2ec4b6")) +
labs(y = str_wrap("TE insertions (-logP)", 25)) +
geom_vline(data = temp, aes(xintercept = Pos), alpha =.4, col = "grey20") +
geom_text(data = temp, aes(label = Gene, x= Pos),
y = -5, # Set the position of the text to always be at '14.25'
size = 4, angle = 90,
vjust = 0, hjust = 0) +
coord_cartesian(ylim = c(-3.8, 27), # This focuses the x-axis on the range of interest
clip = 'off')
cowplot::plot_grid(p, q, ncol = 1)
Manhattan_one_pheno("RIP_med", c("#cbf3f0", "#2ec4b6")) +
labs(y = str_wrap("RIP (-logP)", 25)) +
geom_vline(data = temp, aes(xintercept = Pos), alpha =.4, col = "grey20") +
geom_text(data = temp, aes(label = Gene, x= Pos),
y = -5, # Set the position of the text to always be at '14.25'
size = 4, angle = 90,
vjust = 0, hjust = 0) +
coord_cartesian(ylim = c(-3.8, 27), # This focuses the x-axis on the range of interest
clip = 'off')
#Set parameters for the plot
#chromosome = 5
#min_coord = 220000
#max_coord = 270000
#chromosome = 13
#min_coord = 800048
#max_coord = 1129645
#chromosome = 3
#min_coord = 3190000
#max_coord = 3218000
chromosome = 5
min_coord = 220000
max_coord = 270000
genes_to_highlight = c("Zt09_model_5_00047")
TE_gff = "~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Cecile_IPO323_refTEs_match.gff"
gene_gff = "~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3"
# Prepare the gene section of the plot
c = plot_gff(TE_gff, chromosome, min_coord, max_coord, genes_to_highlight, "grey50")
d = plot_gff(gene_gff, chromosome, min_coord, max_coord, genes_to_highlight)
# Prepare the GWAS section of the plot for the first phenotype
b = results_GWAS %>% filter(CHR == chromosome) %>%
filter(Phenotype == "RIP_med") %>%
filter(BP >= min_coord) %>%
filter(BP <= max_coord) %>%
mutate(logP = -log10(P)) %>%
ggplot() +
geom_point(aes(x = BP, y = logP, col = logP)) +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20") +
theme_bw() +
labs(title = "GWAS based on RIP composite index median",
subtitle = paste0("The region is on the chromosome ", chromosome,
" from ", min_coord, "bp to ", max_coord, "bp.")) +
theme(axis.title.x = element_blank()) +
coord_cartesian(xlim = c(min_coord, max_coord))
cowplot::plot_grid(b, c, d, ncol = 1, align = 'v', axis = 'lr', rel_heights = c(1, 0.3, 0.3))
# Prepare the second plot for the second phenotype
b = results_GWAS %>% filter(CHR == chromosome) %>%
filter(Phenotype == "TE") %>%
filter(BP >= min_coord) %>%
filter(BP <= max_coord) %>%
mutate(logP = -log10(P)) %>%
ggplot() +
geom_point(aes(x = BP, y = logP, col = logP)) +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20")+
theme_bw() +
labs(title = "GWAS based on proportion of TE reads",
subtitle = paste0("The region is on the chromosome ", chromosome,
" from ", min_coord, "bp to ", max_coord, "bp.")) +
theme(axis.title.x = element_blank()) +
coord_cartesian(xlim = c(min_coord, max_coord))
cowplot::plot_grid(b, d, c, ncol = 1, align = 'v', axis = 'lr', rel_heights = c(1, 0.3, 0.3))
p = Manhattan_one_pheno("Duplicated_genes", c("#FFD399", "#ff9f1c")) +
labs(y = str_wrap("Duplicated genes(-logP)", 25))
q = Manhattan_one_pheno("Accessory_chr", c("#cbf3f0", "#2ec4b6")) +
labs(y = str_wrap("Accessory chromosome number (-logP)", 25)) +
coord_cartesian(ylim = c(-3.8, 27), # This focuses the x-axis on the range of interest
clip = 'off')
cowplot::plot_grid(p, q, ncol = 1)